Merge branch 'main' into adil/add_acm_demo

2026-06-17 15:25:17 +02:00 · 2025-01-08 16:55:07 -08:00 · 2025-01-08 16:55:07 -08:00 · 68097fde07
commit 68097fde07
parent 285a66fdb6 dae6239b81
166 changed files with 9507 additions and 11803 deletions
--- a/.github/workflows/e2e_archgw.yml
+++ b/.github/workflows/e2e_archgw.yml
@ -0,0 +1,59 @@
+name: e2e archgw tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest-m
+    defaults:
+      run:
+        working-directory: ./tests/archgw
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: build arch docker image
+        run: |
+          cd ../../ && docker build  -f arch/Dockerfile . -t katanemo/archgw
+
+      - name: start archgw
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+        run: |
+          docker compose up | tee &> archgw.logs &
+
+      - name: wait for archgw to be healthy
+        run: |
+          source common.sh && wait_for_healthz http://localhost:10000/healthz
+
+      - name: install poetry
+        run: |
+          export POETRY_VERSION=1.8.5
+          curl -sSL https://install.python-poetry.org | python3 -
+          export PATH="$HOME/.local/bin:$PATH"
+
+      - name: install test dependencies
+        run: |
+          poetry install
+
+      - name: run archgw tests
+        run: |
+          poetry run pytest || tail -100 archgw.logs
+
+      - name: stop archgw docker container
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+        run: |
+          docker compose down
--- a/.github/workflows/e2e_model_server.yml
+++ b/.github/workflows/e2e_model_server.yml
@ -0,0 +1,41 @@
+name: e2e model server tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest-m
+    defaults:
+      run:
+        working-directory: ./tests/modelserver
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: install poetry
+        run: |
+          export POETRY_VERSION=1.8.5
+          curl -sSL https://install.python-poetry.org | python3 -
+          export PATH="$HOME/.local/bin:$PATH"
+
+      - name: install model server and start it
+        run: |
+          cd ../../model_server/ && poetry install && poetry run archgw_modelserver start
+
+      - name: install test dependencies
+        run: |
+          poetry install
+
+      - name: run tests
+        run: |
+          poetry run pytest
--- a/.github/workflows/e2e_test_demos.yml
+++ b/.github/workflows/e2e_test_demos.yml
@ -0,0 +1,48 @@
+name: e2e demo tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest-m
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: build arch docker image
+        run: |
+          docker build  -f arch/Dockerfile . -t katanemo/archgw
+
+      - name: install poetry
+        run: |
+          export POETRY_VERSION=1.8.5
+          curl -sSL https://install.python-poetry.org | python3 -
+
+      - name: setup python venv
+        run: |
+          python -m venv venv
+
+      - name: install model server, arch gateway and test dependencies
+        run: |
+          source venv/bin/activate
+          cd model_server/ && echo "installing model server" && poetry install
+          cd ../arch/tools && echo "installing archgw cli" && poetry install
+          cd ../../demos/test_runner && echo "installing test dependencies" && poetry install
+
+      - name: run demo tests
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+        run: |
+          source venv/bin/activate
+          cd demos/test_runner && sh run_demo_tests.sh
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@ -8,8 +8,7 @@ on:

 jobs:
  test:
-    runs-on: ubuntu-latest-m
-    # runs-on: gh-large-150gb-ssd
+    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
@ -22,6 +21,7 @@ jobs:

      - name: Install Poetry
        run: |
+          export POETRY_VERSION=1.8.5
          curl -sSL https://install.python-poetry.org | python3 -
          export PATH="$HOME/.local/bin:$PATH"

@ -31,4 +31,4 @@ jobs:
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
        run: |
          python -mvenv venv
-          source venv/bin/activate && cd e2e_tests && bash run_e2e_tests.sh
+          source venv/bin/activate && cd tests/e2e && bash run_e2e_tests.sh
--- a/.github/workflows/model-server-tests.yml
+++ b/.github/workflows/model-server-tests.yml
@ -26,6 +26,7 @@ jobs:
      # Step 3: Install Poetry
      - name: Install Poetry
        run: |
+          export POETRY_VERSION=1.8.5
          curl -sSL https://install.python-poetry.org | python3 -
          export PATH="$HOME/.local/bin:$PATH"

@ -41,4 +42,4 @@ jobs:
          PYTHONPATH: model_server  # Ensure the app's path is available
        run: |
          cd model_server
-          poetry run pytest --maxfail=5 --disable-warnings
+          poetry run pytest
--- a/.gitignore
+++ b/.gitignore
@ -1,35 +1,142 @@
-arch/qdrant_data/
-/venv/
-__pycache__
-grafana-data
-prom_data
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+*.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
 .env
-qdrant_data
-generated
-.DS_Store
-*.gguf
-venv
-demos/function_calling/ollama/models/
-demos/function_calling/ollama/id_ed*
-docs/build/
-demos/function_calling/open-webui/
-demos/employee_details_copilot/open-webui/
-demos/employee_details_copilot_arch/open-webui/
-demos/network_copilot/open-webui/
-demos/employee_details_copilot/ollama/models/
-demos/employee_details_copilot_arch/ollama/models/
-demos/network_copilot/ollama/models/
-arch_log/
-arch/tools/*.egg-info
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# VSCode stuff:
+.vscode/
+
+# MacOS Metadata
+*.DS_Store
+
+
+
+# =========================================
+
+# Arch
 arch/tools/config
 arch/tools/build
-model_server/model_server.egg-info
+
+# Archgw - model_server
 model_server/venv_model_server
 model_server/build
 model_server/dist
+
+# Archgw - Docs
+docs/build/
+
+# Archgw - Demos
+demos/function_calling/ollama/models/
+demos/function_calling/ollama/id_ed*
+demos/function_calling/open-webui/
+demos/function_calling/open-webui/
+demos/shared/signoz/data
+
+# Arch - Miscellaneous
+grafana-data
+prom_data
+arch_log/
 arch_logs/
-dist/
 crates/*/target/
 crates/target/
 build.log
-demos/shared/signoz/data
+
+archgw.log
--- a/README.md
+++ b/README.md
@ -57,7 +57,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
 ```console
 $ python -m venv venv
 $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-$ pip install archgw==0.1.6
+$ pip install archgw==0.1.7
 ```

 ### Build AI Agent with Arch Gateway
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -99,6 +99,8 @@ properties:
                  type: string
              in_path:
                type: boolean
+              format:
+                type: string
            additionalProperties: false
            required:
              - name
--- a/arch/docker-compose.dev.yaml
+++ b/arch/docker-compose.dev.yaml
@ -22,3 +22,4 @@ services:
      - OPENAI_API_KEY=${OPENAI_API_KEY:?error}
      - MISTRAL_API_KEY=${MISTRAL_API_KEY:?error}
      - OTEL_TRACING_HTTP_ENDPOINT=http://host.docker.internal:4318/v1/traces
+      - MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-51000}
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -1,6 +1,32 @@
 admin:
  address:
    socket_address: { address: 0.0.0.0, port_value: 9901 }
+
+stats_config:
+  histogram_bucket_settings:
+    match:
+      exact: "wasmcustom.time_to_first_token"
+    buckets:
+      - 100
+      - 500
+      - 800
+      - 1000
+      - 1200
+      - 1400
+      - 1600
+      - 1800
+      - 2000
+      - 2200
+      - 2400
+      - 3000
+      - 3500
+      - 4000
+      - 4500
+      - 5000
+      - 6000
+      - 10000
+      - 60000
+      - 180000
 static_resources:
  listeners:
    - name: arch_listener_http
@ -211,8 +237,7 @@ static_resources:
                      domains:
                        - "*"
                      routes:
-
-                        {% for internal_clustrer in ["embeddings", "zeroshot", "guard", "arch_fc", "hallucination"] %}
+                        {% for internal_clustrer in ["arch_fc", "model_server"] %}
                        - match:
                            prefix: "/"
                            headers:
@ -225,16 +250,16 @@ static_resources:
                            timeout: 60s
                        {% endfor %}

-                        {% for _, cluster in arch_clusters.items() %}
+                        {% for cluster_name, cluster in arch_clusters.items() %}
                        - match:
                            prefix: "/"
                            headers:
                              - name: "x-arch-upstream"
                                string_match:
-                                  exact: {{ cluster.name }}
+                                  exact: {{ cluster_name }}
                          route:
                            auto_host_rewrite: true
-                            cluster: {{ cluster.name }}
+                            cluster: {{ cluster_name }}
                            timeout: 60s
                        {% endfor %}
                http_filters:
@ -449,7 +474,7 @@ static_resources:
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
          sni: api.mistral.ai
-    {% for internal_clustrer in ["embeddings", "zeroshot", "guard", "arch_fc", "hallucination"] %}
+    {% for internal_clustrer in ["arch_fc", "model_server"] %}
    - name: {{ internal_clustrer }}
      connect_timeout: 5s
      type: STRICT_DNS
@ -463,7 +488,7 @@ static_resources:
                  address:
                    socket_address:
                      address: host.docker.internal
-                      port_value: 51000
+                      port_value: $MODEL_SERVER_PORT
                  hostname: {{ internal_clustrer }}
    {% endfor %}
    - name: mistral_7b_instruct
@ -481,8 +506,8 @@ static_resources:
                      address: mistral_7b_instruct
                      port_value: 10001
                  hostname: "mistral_7b_instruct"
-{% for _, cluster in arch_clusters.items() %}
-    - name: {{ cluster.name }}
+{% for cluster_name, cluster in arch_clusters.items() %}
+    - name: {{ cluster_name }}
      {% if cluster.connect_timeout -%}
      connect_timeout: {{ cluster.connect_timeout }}
      {% else -%}
@ -492,7 +517,7 @@ static_resources:
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
-        cluster_name: {{ cluster.name }}
+        cluster_name: {{ cluster_name }}
        endpoints:
          - lb_endpoints:
              - endpoint:
--- a/arch/tools/README.md
+++ b/arch/tools/README.md
@ -19,7 +19,7 @@ source venv/bin/activate

 ### Step 3: Run the build script
 ```bash
-pip install archgw==0.1.6
+pip install archgw==0.1.7
 ```

 ## Uninstall Instructions: archgw CLI
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -1,3 +1,4 @@
+import json
 import os
 from jinja2 import Environment, FileSystemLoader
 import yaml
@ -47,32 +48,27 @@ def validate_and_render_schema():
    config_schema_yaml = yaml.safe_load(arch_config_schema)
    inferred_clusters = {}

+    endpoints = config_yaml.get("endpoints", {})
+
+    # override the inferred clusters with the ones defined in the config
+    for name, endpoint_details in endpoints.items():
+        inferred_clusters[name] = endpoint_details
+        endpoint = inferred_clusters[name]["endpoint"]
+        if len(endpoint.split(":")) > 1:
+            inferred_clusters[name]["endpoint"] = endpoint.split(":")[0]
+            inferred_clusters[name]["port"] = int(endpoint.split(":")[1])
+
+    print("defined clusters from arch_config.yaml: ", json.dumps(inferred_clusters))
+
    if "prompt_targets" in config_yaml:
        for prompt_target in config_yaml["prompt_targets"]:
            name = prompt_target.get("endpoint", {}).get("name", None)
            if not name:
                continue
            if name not in inferred_clusters:
-                inferred_clusters[name] = {
-                    "name": name,
-                    "port": 80,  # default port
-                }
-
-    endpoints = config_yaml.get("endpoints", {})
-
-    # override the inferred clusters with the ones defined in the config
-    for name, endpoint_details in endpoints.items():
-        if name in inferred_clusters:
-            print("updating cluster", endpoint_details)
-            inferred_clusters[name].update(endpoint_details)
-            endpoint = inferred_clusters[name]["endpoint"]
-            if len(endpoint.split(":")) > 1:
-                inferred_clusters[name]["endpoint"] = endpoint.split(":")[0]
-                inferred_clusters[name]["port"] = int(endpoint.split(":")[1])
-        else:
-            inferred_clusters[name] = endpoint_details
-
-    print("updated clusters", inferred_clusters)
+                raise Exception(
+                    f"Unknown endpoint {name}, please add it in endpoints section in your arch_config.yaml file"
+                )

    arch_llm_providers = config_yaml["llm_providers"]
    arch_tracing = config_yaml.get("tracing", {})
@ -90,6 +86,7 @@ def validate_and_render_schema():

    rendered = template.render(data)
    print(ENVOY_CONFIG_FILE_RENDERED)
+    print(rendered)
    with open(ENVOY_CONFIG_FILE_RENDERED, "w") as file:
        file.write(rendered)

--- a/arch/tools/cli/consts.py
+++ b/arch/tools/cli/consts.py
@ -1,8 +1,6 @@
 KATANEMO_DOCKERHUB_REPO = "katanemo/archgw"
 KATANEMO_LOCAL_MODEL_LIST = [
-    "katanemo/Arch-Guard-cpu",
    "katanemo/Arch-Guard",
-    "katanemo/bge-large-en-v1.5",
 ]
 SERVICE_NAME_ARCHGW = "archgw"
 SERVICE_NAME_MODEL_SERVER = "model_server"
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -4,7 +4,8 @@ import time
 import sys
 import glob
 import docker
-from cli.utils import getLogger
+from docker.errors import DockerException
+from cli.utils import getLogger, update_docker_host_env
 from cli.consts import (
    ARCHGW_DOCKER_IMAGE,
    ARCHGW_DOCKER_NAME,
@ -44,6 +45,7 @@ def start_archgw_docker(client, arch_config_file, env):
        },
        environment={
            "OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
+            "MODEL_SERVER_PORT": os.getenv("MODEL_SERVER_PORT", "51000"),
            **env,
        },
        extra_hosts={"host.docker.internal": "host-gateway"},
@ -78,25 +80,6 @@ def stream_gateway_logs(follow):
        log.info(f"Failed to stream logs: {str(e)}")


-def stream_model_server_logs(follow):
-    """
-    Get the model server logs, check if the user wants to follow/tail them.
-    """
-    log_file_expanded = os.path.expanduser(MODEL_SERVER_LOG_FILE)
-
-    stream_command = ["tail"]
-    if follow:
-        stream_command.append("-f")
-
-    stream_command.append(log_file_expanded)
-    subprocess.run(
-        stream_command,
-        check=True,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-    )
-
-
 def stream_access_logs(follow):
    """
    Get the archgw access logs
@ -117,7 +100,7 @@ def stream_access_logs(follow):
    )


-def start_arch(arch_config_file, env, log_timeout=120):
+def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
    """
    Start Docker Compose in detached mode and stream logs until services are healthy.

@ -128,7 +111,22 @@ def start_arch(arch_config_file, env, log_timeout=120):
    log.info("Starting arch gateway")

    try:
-        client = docker.from_env()
+        try:
+            client = docker.from_env()
+        except DockerException as e:
+            # try setting up the docker host environment variable and retry
+            update_docker_host_env()
+            client = docker.from_env()
+
+        try:
+            container = client.containers.get("archgw")
+            log.info("archgw container found in docker, stopping and removing it")
+            # ensure that previous docker container is stopped and removed
+            container.stop()
+            container.remove()
+            log.info("Stopped and removed archgw container")
+        except docker.errors.NotFound as e:
+            pass

        container = start_archgw_docker(client, arch_config_file, env)

@ -153,6 +151,13 @@ def start_arch(arch_config_file, env, log_timeout=120):
                log.info(f"Container health status: {container_status}")
                time.sleep(1)

+        if foreground:
+            for line in container.logs(stream=True):
+                print(line.decode("utf-8").strip("\n"))
+
+    except KeyboardInterrupt:
+        log.info("Keyboard interrupt received, stopping arch gateway service.")
+        stop_arch()
    except docker.errors.APIError as e:
        log.info(f"Failed to start Arch: {str(e)}")

@ -186,17 +191,23 @@ def download_models_from_hf():
        snapshot_download(repo_id=model)


-def start_arch_modelserver():
+def start_arch_modelserver(foreground):
    """
    Start the model server. This assumes that the archgw_modelserver package is installed locally

    """
    try:
        log.info("archgw_modelserver restart")
-        subprocess.run(
-            ["archgw_modelserver", "restart"], check=True, start_new_session=True
-        )
-        log.info("Successfully ran model_server")
+        if foreground:
+            subprocess.run(
+                ["archgw_modelserver", "start", "--foreground"],
+                check=True,
+            )
+        else:
+            subprocess.run(
+                ["archgw_modelserver", "start"],
+                check=True,
+            )
    except subprocess.CalledProcessError as e:
        log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
        sys.exit(1)
@ -212,7 +223,6 @@ def stop_arch_modelserver():
            ["archgw_modelserver", "stop"],
            check=True,
        )
-        log.info("Successfully stopped the archgw model_server")
    except subprocess.CalledProcessError as e:
        log.info(f"Failed to start model_server. Please check archgw_modelserver logs")
        sys.exit(1)
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -16,10 +16,9 @@ from cli.core import (
    stop_arch_modelserver,
    start_arch,
    stop_arch,
-    stream_gateway_logs,
-    stream_model_server_logs,
-    stream_access_logs,
    download_models_from_hf,
+    stream_access_logs,
+    stream_gateway_logs,
 )
 from cli.consts import (
    KATANEMO_DOCKERHUB_REPO,
@ -138,16 +137,27 @@ def build(service):
    default=SERVICE_ALL,
    help="Service to start. Options are model_server, archgw.",
 )
-def up(file, path, service):
+@click.option(
+    "--foreground",
+    default=False,
+    help="Run Arch in the foreground. Default is False",
+    is_flag=True,
+)
+def up(file, path, service, foreground):
    """Starts Arch."""
    if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
        log.info(f"Error: Invalid service {service}. Exiting")
        sys.exit(1)

+    if service == SERVICE_ALL and foreground:
+        # foreground can only be specified when starting individual services
+        log.info("foreground flag is only supported for individual services. Exiting.")
+        sys.exit(1)
+
    if service == SERVICE_NAME_MODEL_SERVER:
        log.info("Download archgw models from HuggingFace...")
        download_models_from_hf()
-        start_arch_modelserver()
+        start_arch_modelserver(foreground)
        return

    if file:
@ -214,12 +224,11 @@ def up(file, path, service):
    env.update(env_stage)

    if service == SERVICE_NAME_ARCHGW:
-        start_arch(arch_config_file, env)
+        start_arch(arch_config_file, env, foreground=foreground)
    else:
-        # this will used the cached versions of the models, so its safe to use everytime.
        download_models_from_hf()
-        start_arch_modelserver()
-        start_arch(arch_config_file, env)
+        start_arch_modelserver(foreground)
+        start_arch(arch_config_file, env, foreground=foreground)


@click.command()
@ -267,65 +276,37 @@ def generate_prompt_targets(file):


@click.command()
-@click.option(
-    "--service",
-    default=SERVICE_ALL,
-    help="Service to monitor. By default it will monitor both core gateway and model_server logs.",
-)
@click.option(
    "--debug",
    help="For detailed debug logs to trace calls from archgw <> model_server <> api_server, etc",
    is_flag=True,
 )
@click.option("--follow", help="Follow the logs", is_flag=True)
-def logs(service, debug, follow):
+def logs(debug, follow):
    """Stream logs from access logs services."""

-    if service not in [SERVICE_NAME_ARCHGW, SERVICE_NAME_MODEL_SERVER, SERVICE_ALL]:
-        print(f"Error: Invalid service {service}. Exiting")
-        sys.exit(1)
-
-    if debug:
-        try:
-            archgw_process = None
-            if service == SERVICE_NAME_ARCHGW or service == SERVICE_ALL:
-                archgw_process = multiprocessing.Process(
-                    target=stream_gateway_logs, args=(follow,)
-                )
-                archgw_process.start()
-
-            model_server_process = None
-            if service == SERVICE_NAME_MODEL_SERVER or service == SERVICE_ALL:
-                model_server_process = multiprocessing.Process(
-                    target=stream_model_server_logs, args=(follow,)
-                )
-                model_server_process.start()
-
-            if archgw_process:
-                archgw_process.join()
-            if model_server_process:
-                model_server_process.join()
-        except KeyboardInterrupt:
-            log.info("KeyboardInterrupt detected. Exiting.")
-            if archgw_process and archgw_process.is_alive():
-                archgw_process.terminate()
-
-            if model_server_process and model_server_process.is_alive():
-                model_server_process.terminate()
-    else:
-        try:
-            archgw_access_logs_process = None
-            archgw_access_logs_process = multiprocessing.Process(
-                target=stream_access_logs, args=(follow,)
+    archgw_process = None
+    try:
+        if debug:
+            archgw_process = multiprocessing.Process(
+                target=stream_gateway_logs, args=(follow,)
            )
-            archgw_access_logs_process.start()
+            archgw_process.start()

-            if archgw_access_logs_process:
-                archgw_access_logs_process.join()
-        except KeyboardInterrupt:
-            log.info("KeyboardInterrupt detected. Exiting.")
-            if archgw_access_logs_process.is_alive():
-                archgw_access_logs_process.terminate()
+        archgw_access_logs_process = multiprocessing.Process(
+            target=stream_access_logs, args=(follow,)
+        )
+        archgw_access_logs_process.start()
+        archgw_access_logs_process.join()
+
+        if archgw_process:
+            archgw_process.join()
+    except KeyboardInterrupt:
+        log.info("KeyboardInterrupt detected. Exiting.")
+        if archgw_access_logs_process.is_alive():
+            archgw_access_logs_process.terminate()
+        if archgw_process and archgw_process.is_alive():
+            archgw_process.terminate()


 main.add_command(up)
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@ -1,6 +1,8 @@
+import os
 import yaml
 import logging
 import docker
+from docker.errors import DockerException

 from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME

@ -19,10 +21,32 @@ def getLogger(name="cli"):
 log = getLogger(__name__)


+def update_docker_host_env():
+    """
+    Update DOCKER_HOST environment variable to use the local Docker socket
+    """
+    if os.getenv("DOCKER_HOST"):
+        return
+
+    default_docker_socket = os.getenv("DEFAULT_DOCKER_SOCKET", "/var/run/docker.sock")
+    if not os.path.exists(default_docker_socket):
+        home_dir = os.getenv("HOME")
+        docker_host = f"unix://{home_dir}/.docker/run/docker.sock"
+        log.info(
+            f"Default docker socket {default_docker_socket} not found, using {docker_host}"
+        )
+        os.environ["DOCKER_HOST"] = docker_host
+
+
 def validate_schema(arch_config_file: str) -> None:
    try:
-        client = docker.from_env()
-        # Run the container with detach=True to avoid blocking main process
+        try:
+            client = docker.from_env()
+        except DockerException as e:
+            # try setting up the docker host environment variable and retry
+            update_docker_host_env()
+            client = docker.from_env()
+
        container = client.containers.run(
            image=ARCHGW_DOCKER_IMAGE,
            volumes={
--- a/arch/tools/poetry.lock
+++ b/arch/tools/poetry.lock
--- a/arch/tools/pyproject.toml
+++ b/arch/tools/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "archgw"
-version = "0.1.6"
+version = "0.1.7"
 description = "Python-based CLI tool to manage Arch Gateway."
 authors = ["Katanemo Labs, Inc."]
 packages = [
@ -9,8 +9,8 @@ packages = [
 readme = "README.md"

 [tool.poetry.dependencies]
-python = ">=3.12"
-archgw_modelserver = "0.1.6"
+python = "^3.12"
+archgw_modelserver = "0.1.7"
 pyyaml = "^6.0.2"
 pydantic = "^2.10.1"
 click = "^8.1.7"
--- a/archgw.code-workspace
+++ b/archgw.code-workspace
@ -17,8 +17,16 @@
      "path": "model_server"
    },
    {
-      "name": "e2e_tests",
-      "path": "e2e_tests"
+      "name": "tests_e2e",
+      "path": "tests/e2e"
+    },
+    {
+      "name": "tests_archgw",
+      "path": "tests/archgw"
+    },
+    {
+      "name": "tests_modelserver",
+      "path": "tests/modelserver"
    },
    {
      "name": "chatbot_ui",
@ -41,6 +49,7 @@
      "eamodio.gitlens",
      "ms-python.black-formatter",
      "tamasfe.even-better-toml",
+      "esbenp.prettier-vscode",
      ]
  }
 }
--- a/crates/prompt_gateway/src/hallucination.rs
+++ b/crates/prompt_gateway/src/hallucination.rs
@ -1,7 +1,23 @@
-use common::{
-    common_types::open_ai::Message,
+use std::collections::HashMap;
+
+use crate::{
+    api::open_ai::Message,
    consts::{ARCH_MODEL_PREFIX, HALLUCINATION_TEMPLATE, USER_ROLE},
 };
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HallucinationClassificationRequest {
+    pub prompt: String,
+    pub parameters: HashMap<String, String>,
+    pub model: String,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HallucinationClassificationResponse {
+    pub params_scores: HashMap<String, f64>,
+    pub model: String,
+}

 pub fn extract_messages_for_hallucination(messages: &[Message]) -> Vec<String> {
    let mut arch_assistant = false;
@ -42,7 +58,7 @@ pub fn extract_messages_for_hallucination(messages: &[Message]) -> Vec<String> {

 #[cfg(test)]
 mod test {
-    use common::common_types::open_ai::Message;
+    use crate::api::open_ai::Message;
    use pretty_assertions::assert_eq;

    use super::extract_messages_for_hallucination;
--- a/crates/common/src/api/mod.rs
+++ b/crates/common/src/api/mod.rs
@ -0,0 +1,4 @@
+pub mod hallucination;
+pub mod open_ai;
+pub mod prompt_guard;
+pub mod zero_shot;
--- a/crates/common/src/api/open_ai.rs
+++ b/crates/common/src/api/open_ai.rs
@ -0,0 +1,672 @@
+use crate::consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE};
+use serde::{ser::SerializeMap, Deserialize, Serialize};
+use serde_yaml::Value;
+use std::{
+    collections::{HashMap, VecDeque},
+    fmt::Display,
+};
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionsRequest {
+    #[serde(default)]
+    pub model: String,
+    pub messages: Vec<Message>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<ChatCompletionTool>>,
+    #[serde(default)]
+    pub stream: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<HashMap<String, String>>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum ToolType {
+    #[serde(rename = "function")]
+    Function,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionTool {
+    #[serde(rename = "type")]
+    pub tool_type: ToolType,
+    pub function: FunctionDefinition,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FunctionDefinition {
+    pub name: String,
+    pub description: String,
+    pub parameters: FunctionParameters,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct FunctionParameters {
+    pub properties: HashMap<String, FunctionParameter>,
+}
+
+impl Serialize for FunctionParameters {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        // select all requried parameters
+        let required: Vec<&String> = self
+            .properties
+            .iter()
+            .filter(|(_, v)| v.required.unwrap_or(false))
+            .map(|(k, _)| k)
+            .collect();
+        let mut map = serializer.serialize_map(Some(2))?;
+        map.serialize_entry("properties", &self.properties)?;
+        if !required.is_empty() {
+            map.serialize_entry("required", &required)?;
+        }
+        map.end()
+    }
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct FunctionParameter {
+    #[serde(rename = "type")]
+    #[serde(default = "ParameterType::string")]
+    pub parameter_type: ParameterType,
+    pub description: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub required: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(rename = "enum")]
+    pub enum_values: Option<Vec<String>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub default: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub format: Option<String>,
+}
+
+impl Serialize for FunctionParameter {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let mut map = serializer.serialize_map(Some(5))?;
+        map.serialize_entry("type", &self.parameter_type)?;
+        map.serialize_entry("description", &self.description)?;
+        if let Some(enum_values) = &self.enum_values {
+            map.serialize_entry("enum", enum_values)?;
+        }
+        if let Some(default) = &self.default {
+            map.serialize_entry("default", default)?;
+        }
+        if let Some(format) = &self.format {
+            map.serialize_entry("format", format)?;
+        }
+        map.end()
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum ParameterType {
+    #[serde(rename = "int")]
+    Int,
+    #[serde(rename = "float")]
+    Float,
+    #[serde(rename = "bool")]
+    Bool,
+    #[serde(rename = "str")]
+    String,
+    #[serde(rename = "list")]
+    List,
+    #[serde(rename = "dict")]
+    Dict,
+}
+
+impl From<String> for ParameterType {
+    fn from(s: String) -> Self {
+        match s.as_str() {
+            "int" => ParameterType::Int,
+            "integer" => ParameterType::Int,
+            "float" => ParameterType::Float,
+            "bool" => ParameterType::Bool,
+            "boolean" => ParameterType::Bool,
+            "str" => ParameterType::String,
+            "string" => ParameterType::String,
+            "list" => ParameterType::List,
+            "array" => ParameterType::List,
+            "dict" => ParameterType::Dict,
+            "dictionary" => ParameterType::Dict,
+            _ => ParameterType::String,
+        }
+    }
+}
+
+impl ParameterType {
+    pub fn string() -> ParameterType {
+        ParameterType::String
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StreamOptions {
+    pub include_usage: bool,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Message {
+    pub role: String,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<String>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_calls: Option<Vec<ToolCall>>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_call_id: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Choice {
+    pub finish_reason: Option<String>,
+    pub index: Option<usize>,
+    pub message: Message,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ToolCall {
+    pub id: String,
+    #[serde(rename = "type")]
+    pub tool_type: ToolType,
+    pub function: FunctionCallDetail,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FunctionCallDetail {
+    pub name: String,
+    pub arguments: HashMap<String, Value>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ToolCallState {
+    pub key: String,
+    pub message: Option<Message>,
+    pub tool_call: FunctionCallDetail,
+    pub tool_response: String,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum ArchState {
+    ToolCall(Vec<ToolCallState>),
+}
+#[derive(Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum ModelServerResponse {
+    ChatCompletionsResponse(ChatCompletionsResponse),
+    ModelServerErrorResponse(ModelServerErrorResponse),
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelServerErrorResponse {
+    pub result: String,
+    pub intent_latency: f64,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionsResponse {
+    pub usage: Option<Usage>,
+    pub choices: Vec<Choice>,
+    pub model: String,
+    pub metadata: Option<HashMap<String, String>>,
+}
+
+impl ChatCompletionsResponse {
+    pub fn new(message: String) -> Self {
+        ChatCompletionsResponse {
+            choices: vec![Choice {
+                message: Message {
+                    role: ASSISTANT_ROLE.to_string(),
+                    content: Some(message),
+                    model: Some(ARCH_FC_MODEL_NAME.to_string()),
+                    tool_calls: None,
+                    tool_call_id: None,
+                },
+                index: Some(0),
+                finish_reason: Some("done".to_string()),
+            }],
+            usage: None,
+            model: ARCH_FC_MODEL_NAME.to_string(),
+            metadata: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Usage {
+    pub completion_tokens: usize,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionStreamResponse {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    pub choices: Vec<ChunkChoice>,
+}
+
+impl ChatCompletionStreamResponse {
+    pub fn new(
+        response: Option<String>,
+        role: Option<String>,
+        model: Option<String>,
+        tool_calls: Option<Vec<ToolCall>>,
+    ) -> Self {
+        ChatCompletionStreamResponse {
+            model,
+            choices: vec![ChunkChoice {
+                delta: Delta {
+                    role,
+                    content: response,
+                    tool_calls,
+                    model: None,
+                    tool_call_id: None,
+                },
+                finish_reason: None,
+            }],
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum ChatCompletionChunkResponseError {
+    #[error("failed to deserialize")]
+    Deserialization(#[from] serde_json::Error),
+    #[error("empty content in data chunk")]
+    EmptyContent,
+    #[error("no chunks present")]
+    NoChunks,
+}
+
+pub struct ChatCompletionStreamResponseServerEvents {
+    pub events: Vec<ChatCompletionStreamResponse>,
+}
+
+impl Display for ChatCompletionStreamResponseServerEvents {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let tokens_str = self
+            .events
+            .iter()
+            .map(|response_chunk| {
+                if response_chunk.choices.is_empty() {
+                    return "".to_string();
+                }
+                response_chunk.choices[0]
+                    .delta
+                    .content
+                    .clone()
+                    .unwrap_or("".to_string())
+            })
+            .collect::<Vec<String>>()
+            .join("");
+
+        write!(f, "{}", tokens_str)
+    }
+}
+
+impl TryFrom<&str> for ChatCompletionStreamResponseServerEvents {
+    type Error = ChatCompletionChunkResponseError;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        let response_chunks: VecDeque<ChatCompletionStreamResponse> = value
+            .lines()
+            .filter(|line| line.starts_with("data: "))
+            .map(|line| line.get(6..).unwrap())
+            .filter(|data_chunk| *data_chunk != "[DONE]")
+            .map(serde_json::from_str::<ChatCompletionStreamResponse>)
+            .collect::<Result<VecDeque<ChatCompletionStreamResponse>, _>>()?;
+
+        Ok(ChatCompletionStreamResponseServerEvents {
+            events: response_chunks.into(),
+        })
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChunkChoice {
+    pub delta: Delta,
+    // TODO: could this be an enum?
+    pub finish_reason: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Delta {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub role: Option<String>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<String>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_calls: Option<Vec<ToolCall>>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_call_id: Option<String>,
+}
+
+pub fn to_server_events(chunks: Vec<ChatCompletionStreamResponse>) -> String {
+    let mut response_str = String::new();
+    for chunk in chunks.iter() {
+        response_str.push_str("data: ");
+        response_str.push_str(&serde_json::to_string(&chunk).unwrap());
+        response_str.push_str("\n\n");
+    }
+    response_str
+}
+
+#[cfg(test)]
+mod test {
+    use super::{ChatCompletionStreamResponseServerEvents, Message};
+    use pretty_assertions::assert_eq;
+    use std::collections::HashMap;
+
+    const TOOL_SERIALIZED: &str = r#"{
+  "model": "gpt-3.5-turbo",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What city do you want to know the weather for?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "weather_forecast",
+        "description": "function to retrieve weather forecast",
+        "parameters": {
+          "properties": {
+            "city": {
+              "type": "str",
+              "description": "city for weather forecast",
+              "default": "test"
+            }
+          },
+          "required": [
+            "city"
+          ]
+        }
+      }
+    }
+  ],
+  "stream": true,
+  "stream_options": {
+    "include_usage": true
+  }
+}"#;
+
+    #[test]
+    fn test_tool_type_request() {
+        use super::{
+            ChatCompletionTool, ChatCompletionsRequest, FunctionDefinition, FunctionParameter,
+            FunctionParameters, ParameterType, StreamOptions, ToolType,
+        };
+
+        let mut properties = HashMap::new();
+        properties.insert(
+            "city".to_string(),
+            FunctionParameter {
+                parameter_type: ParameterType::String,
+                description: "city for weather forecast".to_string(),
+                required: Some(true),
+                enum_values: None,
+                default: Some("test".to_string()),
+                format: None,
+            },
+        );
+
+        let function_definition = FunctionDefinition {
+            name: "weather_forecast".to_string(),
+            description: "function to retrieve weather forecast".to_string(),
+            parameters: FunctionParameters { properties },
+        };
+
+        let chat_completions_request = ChatCompletionsRequest {
+            model: "gpt-3.5-turbo".to_string(),
+            messages: vec![Message {
+                role: "user".to_string(),
+                content: Some("What city do you want to know the weather for?".to_string()),
+                model: None,
+                tool_calls: None,
+                tool_call_id: None,
+            }],
+            tools: Some(vec![ChatCompletionTool {
+                tool_type: ToolType::Function,
+                function: function_definition,
+            }]),
+            stream: true,
+            stream_options: Some(StreamOptions {
+                include_usage: true,
+            }),
+            metadata: None,
+        };
+
+        let serialized = serde_json::to_string_pretty(&chat_completions_request).unwrap();
+        println!("{}", serialized);
+        assert_eq!(TOOL_SERIALIZED, serialized);
+    }
+
+    #[test]
+    fn test_parameter_types() {
+        use super::{FunctionParameter, ParameterType};
+
+        const PARAMETER_SERIALZIED: &str = r#"{
+  "city": {
+    "type": "str",
+    "description": "city for weather forecast",
+    "default": "test"
+  }
+}"#;
+
+        let properties = HashMap::from([(
+            "city".to_string(),
+            FunctionParameter {
+                parameter_type: ParameterType::String,
+                description: "city for weather forecast".to_string(),
+                required: Some(true),
+                enum_values: None,
+                default: Some("test".to_string()),
+                format: None,
+            },
+        )]);
+
+        let serialized = serde_json::to_string_pretty(&properties).unwrap();
+        assert_eq!(PARAMETER_SERIALZIED, serialized);
+
+        // ensure that if type is missing it is set to string
+        const PARAMETER_SERIALZIED_MISSING_TYPE: &str = r#"
+        {
+          "city": {
+            "description": "city for weather forecast"
+          }
+        }"#;
+
+        let missing_type_deserialized: HashMap<String, FunctionParameter> =
+            serde_json::from_str(PARAMETER_SERIALZIED_MISSING_TYPE).unwrap();
+        println!("{:?}", missing_type_deserialized);
+        assert_eq!(
+            missing_type_deserialized
+                .get("city")
+                .unwrap()
+                .parameter_type,
+            ParameterType::String
+        );
+    }
+
+    #[test]
+    fn stream_chunk_parse() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" How"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" can"},"logprobs":null,"finish_reason":null}]}
+
+
+"#;
+
+        let sever_events =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+        assert_eq!(sever_events.events.len(), 5);
+        assert_eq!(
+            sever_events.events[0].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            ""
+        );
+        assert_eq!(
+            sever_events.events[1].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            "Hello"
+        );
+        assert_eq!(
+            sever_events.events[2].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            "!"
+        );
+        assert_eq!(
+            sever_events.events[3].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " How"
+        );
+        assert_eq!(
+            sever_events.events[4].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " can"
+        );
+        assert_eq!(sever_events.to_string(), "Hello! How can");
+    }
+
+    #[test]
+    fn stream_chunk_parse_done() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" I"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" assist"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" you"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" today"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}]}
+
+data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
+
+data: [DONE]
+"#;
+
+        let sever_events: ChatCompletionStreamResponseServerEvents =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+        assert_eq!(sever_events.events.len(), 6);
+        assert_eq!(
+            sever_events.events[0].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " I"
+        );
+        assert_eq!(
+            sever_events.events[1].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " assist"
+        );
+        assert_eq!(
+            sever_events.events[2].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " you"
+        );
+        assert_eq!(
+            sever_events.events[3].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            " today"
+        );
+        assert_eq!(
+            sever_events.events[4].choices[0]
+                .delta
+                .content
+                .as_ref()
+                .unwrap(),
+            "?"
+        );
+        assert_eq!(sever_events.events[5].choices[0].delta.content, None);
+
+        assert_eq!(sever_events.to_string(), " I assist you today?");
+    }
+
+    #[test]
+    fn stream_chunk_parse_mistral() {
+        const CHUNK_RESPONSE: &str = r#"data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"!"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" How"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" can"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" I"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" assist"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" you"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" today"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"?"},"finish_reason":null}]}
+
+data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":""},"finish_reason":"stop"}],"usage":{"prompt_tokens":4,"total_tokens":13,"completion_tokens":9}}
+
+data: [DONE]
+"#;
+
+        let sever_events: ChatCompletionStreamResponseServerEvents =
+            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
+        assert_eq!(sever_events.events.len(), 11);
+
+        assert_eq!(
+            sever_events.to_string(),
+            "Hello! How can I assist you today?"
+        );
+    }
+}
--- a/crates/common/src/api/prompt_guard.rs
+++ b/crates/common/src/api/prompt_guard.rs
@ -0,0 +1,25 @@
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum PromptGuardTask {
+    #[serde(rename = "jailbreak")]
+    Jailbreak,
+    #[serde(rename = "toxicity")]
+    Toxicity,
+    #[serde(rename = "both")]
+    Both,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromptGuardRequest {
+    pub input: String,
+    pub task: PromptGuardTask,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromptGuardResponse {
+    pub toxic_prob: Option<f64>,
+    pub jailbreak_prob: Option<f64>,
+    pub toxic_verdict: Option<bool>,
+    pub jailbreak_verdict: Option<bool>,
+}
--- a/crates/common/src/api/zero_shot.rs
+++ b/crates/common/src/api/zero_shot.rs
@ -0,0 +1,18 @@
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ZeroShotClassificationRequest {
+    pub input: String,
+    pub labels: Vec<String>,
+    pub model: String,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ZeroShotClassificationResponse {
+    pub predicted_class: String,
+    pub predicted_class_score: f64,
+    pub scores: HashMap<String, f64>,
+    pub model: String,
+}
--- a/crates/common/src/common_types.rs
+++ b/crates/common/src/common_types.rs
@ -1,743 +0,0 @@
-use crate::configuration::PromptTarget;
-use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct EmbeddingRequest {
-    pub prompt_target: PromptTarget,
-}
-
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
-pub enum EmbeddingType {
-    Name,
-    Description,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct VectorPoint {
-    pub id: String,
-    pub payload: HashMap<String, String>,
-    pub vector: Vec<f64>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct StoreVectorEmbeddingsRequest {
-    pub points: Vec<VectorPoint>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SearchPointResult {
-    pub id: String,
-    pub version: i32,
-    pub score: f64,
-    pub payload: HashMap<String, String>,
-}
-
-pub mod open_ai {
-    use std::{
-        collections::{HashMap, VecDeque},
-        fmt::Display,
-    };
-
-    use serde::{ser::SerializeMap, Deserialize, Serialize};
-    use serde_yaml::Value;
-
-    use crate::consts::{ARCH_FC_MODEL_NAME, ASSISTANT_ROLE};
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChatCompletionsRequest {
-        #[serde(default)]
-        pub model: String,
-        pub messages: Vec<Message>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub tools: Option<Vec<ChatCompletionTool>>,
-        #[serde(default)]
-        pub stream: bool,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub stream_options: Option<StreamOptions>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub metadata: Option<HashMap<String, String>>,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub enum ToolType {
-        #[serde(rename = "function")]
-        Function,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChatCompletionTool {
-        #[serde(rename = "type")]
-        pub tool_type: ToolType,
-        pub function: FunctionDefinition,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct FunctionDefinition {
-        pub name: String,
-        pub description: String,
-        pub parameters: FunctionParameters,
-    }
-
-    #[derive(Debug, Clone, Deserialize)]
-    pub struct FunctionParameters {
-        pub properties: HashMap<String, FunctionParameter>,
-    }
-
-    impl Serialize for FunctionParameters {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            // select all requried parameters
-            let required: Vec<&String> = self
-                .properties
-                .iter()
-                .filter(|(_, v)| v.required.unwrap_or(false))
-                .map(|(k, _)| k)
-                .collect();
-            let mut map = serializer.serialize_map(Some(2))?;
-            map.serialize_entry("properties", &self.properties)?;
-            if !required.is_empty() {
-                map.serialize_entry("required", &required)?;
-            }
-            map.end()
-        }
-    }
-
-    #[derive(Debug, Clone, Deserialize)]
-    pub struct FunctionParameter {
-        #[serde(rename = "type")]
-        #[serde(default = "ParameterType::string")]
-        pub parameter_type: ParameterType,
-        pub description: String,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub required: Option<bool>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        #[serde(rename = "enum")]
-        pub enum_values: Option<Vec<String>>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub default: Option<String>,
-    }
-
-    impl Serialize for FunctionParameter {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            let mut map = serializer.serialize_map(Some(5))?;
-            map.serialize_entry("type", &self.parameter_type)?;
-            map.serialize_entry("description", &self.description)?;
-            if let Some(enum_values) = &self.enum_values {
-                map.serialize_entry("enum", enum_values)?;
-            }
-            if let Some(default) = &self.default {
-                map.serialize_entry("default", default)?;
-            }
-            map.end()
-        }
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-    pub enum ParameterType {
-        #[serde(rename = "int")]
-        Int,
-        #[serde(rename = "float")]
-        Float,
-        #[serde(rename = "bool")]
-        Bool,
-        #[serde(rename = "str")]
-        String,
-        #[serde(rename = "list")]
-        List,
-        #[serde(rename = "dict")]
-        Dict,
-    }
-
-    impl From<String> for ParameterType {
-        fn from(s: String) -> Self {
-            match s.as_str() {
-                "int" => ParameterType::Int,
-                "integer" => ParameterType::Int,
-                "float" => ParameterType::Float,
-                "bool" => ParameterType::Bool,
-                "boolean" => ParameterType::Bool,
-                "str" => ParameterType::String,
-                "string" => ParameterType::String,
-                "list" => ParameterType::List,
-                "array" => ParameterType::List,
-                "dict" => ParameterType::Dict,
-                "dictionary" => ParameterType::Dict,
-                _ => ParameterType::String,
-            }
-        }
-    }
-
-    impl ParameterType {
-        pub fn string() -> ParameterType {
-            ParameterType::String
-        }
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct StreamOptions {
-        pub include_usage: bool,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct Message {
-        pub role: String,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub content: Option<String>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub model: Option<String>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub tool_calls: Option<Vec<ToolCall>>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub tool_call_id: Option<String>,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct Choice {
-        pub finish_reason: String,
-        pub index: usize,
-        pub message: Message,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ToolCall {
-        pub id: String,
-        #[serde(rename = "type")]
-        pub tool_type: ToolType,
-        pub function: FunctionCallDetail,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct FunctionCallDetail {
-        pub name: String,
-        pub arguments: HashMap<String, Value>,
-    }
-
-    #[derive(Debug, Deserialize, Serialize)]
-    pub struct ToolCallState {
-        pub key: String,
-        pub message: Option<Message>,
-        pub tool_call: FunctionCallDetail,
-        pub tool_response: String,
-    }
-
-    #[derive(Debug, Deserialize, Serialize)]
-    #[serde(untagged)]
-    pub enum ArchState {
-        ToolCall(Vec<ToolCallState>),
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChatCompletionsResponse {
-        pub usage: Option<Usage>,
-        pub choices: Vec<Choice>,
-        pub model: String,
-        pub metadata: Option<HashMap<String, String>>,
-    }
-
-    impl ChatCompletionsResponse {
-        pub fn new(message: String) -> Self {
-            ChatCompletionsResponse {
-                choices: vec![Choice {
-                    message: Message {
-                        role: ASSISTANT_ROLE.to_string(),
-                        content: Some(message),
-                        model: Some(ARCH_FC_MODEL_NAME.to_string()),
-                        tool_calls: None,
-                        tool_call_id: None,
-                    },
-                    index: 0,
-                    finish_reason: "done".to_string(),
-                }],
-                usage: None,
-                model: ARCH_FC_MODEL_NAME.to_string(),
-                metadata: None,
-            }
-        }
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct Usage {
-        pub completion_tokens: usize,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChatCompletionStreamResponse {
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub model: Option<String>,
-        pub choices: Vec<ChunkChoice>,
-    }
-
-    impl ChatCompletionStreamResponse {
-        pub fn new(
-            response: Option<String>,
-            role: Option<String>,
-            model: Option<String>,
-            tool_calls: Option<Vec<ToolCall>>,
-        ) -> Self {
-            ChatCompletionStreamResponse {
-                model,
-                choices: vec![ChunkChoice {
-                    delta: Delta {
-                        role,
-                        content: response,
-                        tool_calls,
-                        model: None,
-                        tool_call_id: None,
-                    },
-                    finish_reason: None,
-                }],
-            }
-        }
-    }
-
-    #[derive(Debug, thiserror::Error)]
-    pub enum ChatCompletionChunkResponseError {
-        #[error("failed to deserialize")]
-        Deserialization(#[from] serde_json::Error),
-        #[error("empty content in data chunk")]
-        EmptyContent,
-        #[error("no chunks present")]
-        NoChunks,
-    }
-
-    pub struct ChatCompletionStreamResponseServerEvents {
-        pub events: Vec<ChatCompletionStreamResponse>,
-    }
-
-    impl Display for ChatCompletionStreamResponseServerEvents {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            let tokens_str = self
-                .events
-                .iter()
-                .map(|response_chunk| {
-                    if response_chunk.choices.is_empty() {
-                        return "".to_string();
-                    }
-                    response_chunk.choices[0]
-                        .delta
-                        .content
-                        .clone()
-                        .unwrap_or("".to_string())
-                })
-                .collect::<Vec<String>>()
-                .join("");
-
-            write!(f, "{}", tokens_str)
-        }
-    }
-
-    impl TryFrom<&str> for ChatCompletionStreamResponseServerEvents {
-        type Error = ChatCompletionChunkResponseError;
-
-        fn try_from(value: &str) -> Result<Self, Self::Error> {
-            let response_chunks: VecDeque<ChatCompletionStreamResponse> = value
-                .lines()
-                .filter(|line| line.starts_with("data: "))
-                .map(|line| line.get(6..).unwrap())
-                .filter(|data_chunk| *data_chunk != "[DONE]")
-                .map(serde_json::from_str::<ChatCompletionStreamResponse>)
-                .collect::<Result<VecDeque<ChatCompletionStreamResponse>, _>>()?;
-
-            Ok(ChatCompletionStreamResponseServerEvents {
-                events: response_chunks.into(),
-            })
-        }
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct ChunkChoice {
-        pub delta: Delta,
-        // TODO: could this be an enum?
-        pub finish_reason: Option<String>,
-    }
-
-    #[derive(Debug, Clone, Serialize, Deserialize)]
-    pub struct Delta {
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub role: Option<String>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub content: Option<String>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub tool_calls: Option<Vec<ToolCall>>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub model: Option<String>,
-
-        #[serde(skip_serializing_if = "Option::is_none")]
-        pub tool_call_id: Option<String>,
-    }
-
-    pub fn to_server_events(chunks: Vec<ChatCompletionStreamResponse>) -> String {
-        let mut response_str = String::new();
-        for chunk in chunks.iter() {
-            response_str.push_str("data: ");
-            response_str.push_str(&serde_json::to_string(&chunk).unwrap());
-            response_str.push_str("\n\n");
-        }
-        response_str
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ZeroShotClassificationRequest {
-    pub input: String,
-    pub labels: Vec<String>,
-    pub model: String,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ZeroShotClassificationResponse {
-    pub predicted_class: String,
-    pub predicted_class_score: f64,
-    pub scores: HashMap<String, f64>,
-    pub model: String,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HallucinationClassificationRequest {
-    pub prompt: String,
-    pub parameters: HashMap<String, String>,
-    pub model: String,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HallucinationClassificationResponse {
-    pub params_scores: HashMap<String, f64>,
-    pub model: String,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum PromptGuardTask {
-    #[serde(rename = "jailbreak")]
-    Jailbreak,
-    #[serde(rename = "toxicity")]
-    Toxicity,
-    #[serde(rename = "both")]
-    Both,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PromptGuardRequest {
-    pub input: String,
-    pub task: PromptGuardTask,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PromptGuardResponse {
-    pub toxic_prob: Option<f64>,
-    pub jailbreak_prob: Option<f64>,
-    pub toxic_verdict: Option<bool>,
-    pub jailbreak_verdict: Option<bool>,
-}
-
-#[cfg(test)]
-mod test {
-    use crate::common_types::open_ai::{ChatCompletionStreamResponseServerEvents, Message};
-    use pretty_assertions::assert_eq;
-    use std::collections::HashMap;
-
-    const TOOL_SERIALIZED: &str = r#"{
-  "model": "gpt-3.5-turbo",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What city do you want to know the weather for?"
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "weather_forecast",
-        "description": "function to retrieve weather forecast",
-        "parameters": {
-          "properties": {
-            "city": {
-              "type": "str",
-              "description": "city for weather forecast",
-              "default": "test"
-            }
-          },
-          "required": [
-            "city"
-          ]
-        }
-      }
-    }
-  ],
-  "stream": true,
-  "stream_options": {
-    "include_usage": true
-  }
-}"#;
-
-    #[test]
-    fn test_tool_type_request() {
-        use super::open_ai::{
-            ChatCompletionsRequest, FunctionDefinition, FunctionParameter, ParameterType, ToolType,
-        };
-
-        let mut properties = HashMap::new();
-        properties.insert(
-            "city".to_string(),
-            FunctionParameter {
-                parameter_type: ParameterType::String,
-                description: "city for weather forecast".to_string(),
-                required: Some(true),
-                enum_values: None,
-                default: Some("test".to_string()),
-            },
-        );
-
-        let function_definition = FunctionDefinition {
-            name: "weather_forecast".to_string(),
-            description: "function to retrieve weather forecast".to_string(),
-            parameters: super::open_ai::FunctionParameters { properties },
-        };
-
-        let chat_completions_request = ChatCompletionsRequest {
-            model: "gpt-3.5-turbo".to_string(),
-            messages: vec![Message {
-                role: "user".to_string(),
-                content: Some("What city do you want to know the weather for?".to_string()),
-                model: None,
-                tool_calls: None,
-                tool_call_id: None,
-            }],
-            tools: Some(vec![super::open_ai::ChatCompletionTool {
-                tool_type: ToolType::Function,
-                function: function_definition,
-            }]),
-            stream: true,
-            stream_options: Some(super::open_ai::StreamOptions {
-                include_usage: true,
-            }),
-            metadata: None,
-        };
-
-        let serialized = serde_json::to_string_pretty(&chat_completions_request).unwrap();
-        println!("{}", serialized);
-        assert_eq!(TOOL_SERIALIZED, serialized);
-    }
-
-    #[test]
-    fn test_parameter_types() {
-        use super::open_ai::{FunctionParameter, ParameterType};
-
-        const PARAMETER_SERIALZIED: &str = r#"{
-  "city": {
-    "type": "str",
-    "description": "city for weather forecast",
-    "default": "test"
-  }
-}"#;
-
-        let properties = HashMap::from([(
-            "city".to_string(),
-            FunctionParameter {
-                parameter_type: ParameterType::String,
-                description: "city for weather forecast".to_string(),
-                required: Some(true),
-                enum_values: None,
-                default: Some("test".to_string()),
-            },
-        )]);
-
-        let serialized = serde_json::to_string_pretty(&properties).unwrap();
-        assert_eq!(PARAMETER_SERIALZIED, serialized);
-
-        // ensure that if type is missing it is set to string
-        const PARAMETER_SERIALZIED_MISSING_TYPE: &str = r#"
-        {
-          "city": {
-            "description": "city for weather forecast"
-          }
-        }"#;
-
-        let missing_type_deserialized: HashMap<String, FunctionParameter> =
-            serde_json::from_str(PARAMETER_SERIALZIED_MISSING_TYPE).unwrap();
-        println!("{:?}", missing_type_deserialized);
-        assert_eq!(
-            missing_type_deserialized
-                .get("city")
-                .unwrap()
-                .parameter_type,
-            ParameterType::String
-        );
-    }
-
-    #[test]
-    fn stream_chunk_parse() {
-        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"role":"assistant","content":"","refusal":null},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" How"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALmdmtKulBMEq3fRLbrnxJwcKOqvS","object":"chat.completion.chunk","created":1729755226,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" can"},"logprobs":null,"finish_reason":null}]}
-
-
-"#;
-
-        let sever_events =
-            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
-        assert_eq!(sever_events.events.len(), 5);
-        assert_eq!(
-            sever_events.events[0].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            ""
-        );
-        assert_eq!(
-            sever_events.events[1].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            "Hello"
-        );
-        assert_eq!(
-            sever_events.events[2].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            "!"
-        );
-        assert_eq!(
-            sever_events.events[3].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " How"
-        );
-        assert_eq!(
-            sever_events.events[4].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " can"
-        );
-        assert_eq!(sever_events.to_string(), "Hello! How can");
-    }
-
-    #[test]
-    fn stream_chunk_parse_done() {
-        const CHUNK_RESPONSE: &str = r#"data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" I"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" assist"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" you"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":" today"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}]}
-
-data: {"id":"chatcmpl-ALn2KTfmrIpYd9N3Un4Kyg08WIIP6","object":"chat.completion.chunk","created":1729756748,"model":"gpt-3.5-turbo-0125","system_fingerprint":null,"choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
-
-data: [DONE]
-"#;
-
-        let sever_events: ChatCompletionStreamResponseServerEvents =
-            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
-        assert_eq!(sever_events.events.len(), 6);
-        assert_eq!(
-            sever_events.events[0].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " I"
-        );
-        assert_eq!(
-            sever_events.events[1].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " assist"
-        );
-        assert_eq!(
-            sever_events.events[2].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " you"
-        );
-        assert_eq!(
-            sever_events.events[3].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            " today"
-        );
-        assert_eq!(
-            sever_events.events[4].choices[0]
-                .delta
-                .content
-                .as_ref()
-                .unwrap(),
-            "?"
-        );
-        assert_eq!(sever_events.events[5].choices[0].delta.content, None);
-
-        assert_eq!(sever_events.to_string(), " I assist you today?");
-    }
-
-    #[test]
-    fn stream_chunk_parse_mistral() {
-        const CHUNK_RESPONSE: &str = r#"data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"!"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" How"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" can"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" I"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" assist"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" you"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":" today"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":"?"},"finish_reason":null}]}
-
-data: {"id":"e1ebce16de5443b79613512c2d757936","object":"chat.completion.chunk","created":1729805261,"model":"ministral-8b-latest","choices":[{"index":0,"delta":{"content":""},"finish_reason":"stop"}],"usage":{"prompt_tokens":4,"total_tokens":13,"completion_tokens":9}}
-
-data: [DONE]
-"#;
-
-        let sever_events: ChatCompletionStreamResponseServerEvents =
-            ChatCompletionStreamResponseServerEvents::try_from(CHUNK_RESPONSE).unwrap();
-        assert_eq!(sever_events.events.len(), 11);
-
-        assert_eq!(
-            sever_events.to_string(),
-            "Hello! How can I assist you today?"
-        );
-    }
-}
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -2,6 +2,26 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::fmt::Display;

+use crate::api::open_ai::{
+    ChatCompletionTool, FunctionDefinition, FunctionParameter, FunctionParameters, ParameterType,
+};
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Configuration {
+    pub version: String,
+    pub listener: Listener,
+    pub endpoints: Option<HashMap<String, Endpoint>>,
+    pub llm_providers: Vec<LlmProvider>,
+    pub overrides: Option<Overrides>,
+    pub system_prompt: Option<String>,
+    pub prompt_guards: Option<PromptGuards>,
+    pub prompt_targets: Option<Vec<PromptTarget>>,
+    pub error_target: Option<ErrorTargetDetail>,
+    pub ratelimits: Option<Vec<Ratelimit>>,
+    pub tracing: Option<Tracing>,
+    pub mode: Option<GatewayMode>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct Overrides {
    pub prompt_target_intent_matching_threshold: Option<f64>,
@ -22,22 +42,6 @@ pub enum GatewayMode {
    Prompt,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Configuration {
-    pub version: String,
-    pub listener: Listener,
-    pub endpoints: Option<HashMap<String, Endpoint>>,
-    pub llm_providers: Vec<LlmProvider>,
-    pub overrides: Option<Overrides>,
-    pub system_prompt: Option<String>,
-    pub prompt_guards: Option<PromptGuards>,
-    pub prompt_targets: Option<Vec<PromptTarget>>,
-    pub error_target: Option<ErrorTargetDetail>,
-    pub ratelimits: Option<Vec<Ratelimit>>,
-    pub tracing: Option<Tracing>,
-    pub mode: Option<GatewayMode>,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ErrorTargetDetail {
    pub endpoint: Option<EndpointDetails>,
@ -192,6 +196,7 @@ pub struct Parameter {
    pub enum_values: Option<Vec<String>>,
    pub default: Option<String>,
    pub in_path: Option<bool>,
+    pub format: Option<String>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)]
@ -233,11 +238,47 @@ pub struct PromptTarget {
    pub auto_llm_dispatch_on_response: Option<bool>,
 }

+// convert PromptTarget to ChatCompletionTool
+impl From<&PromptTarget> for ChatCompletionTool {
+    fn from(val: &PromptTarget) -> Self {
+        let properties: HashMap<String, FunctionParameter> = match val.parameters {
+            Some(ref entities) => {
+                let mut properties: HashMap<String, FunctionParameter> = HashMap::new();
+                for entity in entities.iter() {
+                    let param = FunctionParameter {
+                        parameter_type: ParameterType::from(
+                            entity.parameter_type.clone().unwrap_or("str".to_string()),
+                        ),
+                        description: entity.description.clone(),
+                        required: entity.required,
+                        enum_values: entity.enum_values.clone(),
+                        default: entity.default.clone(),
+                        format: entity.format.clone(),
+                    };
+                    properties.insert(entity.name.clone(), param);
+                }
+                properties
+            }
+            None => HashMap::new(),
+        };
+
+        ChatCompletionTool {
+            tool_type: crate::api::open_ai::ToolType::Function,
+            function: FunctionDefinition {
+                name: val.name.clone(),
+                description: val.description.clone(),
+                parameters: FunctionParameters { properties },
+            },
+        }
+    }
+}
+
 #[cfg(test)]
 mod test {
+    use pretty_assertions::assert_eq;
    use std::fs;

-    use crate::configuration::GuardType;
+    use crate::{api::open_ai::ToolType, configuration::GuardType};

    #[test]
    fn test_deserialize_configuration() {
@ -309,4 +350,76 @@ mod test {
        let mode = config.mode.as_ref().unwrap_or(&super::GatewayMode::Prompt);
        assert_eq!(*mode, super::GatewayMode::Prompt);
    }
+
+    #[test]
+    fn test_tool_conversion() {
+        let ref_config = fs::read_to_string(
+            "../../docs/source/resources/includes/arch_config_full_reference.yaml",
+        )
+        .expect("reference config file not found");
+        let config: super::Configuration = serde_yaml::from_str(&ref_config).unwrap();
+        let prompt_targets = &config.prompt_targets;
+        let prompt_target = prompt_targets
+            .as_ref()
+            .unwrap()
+            .iter()
+            .find(|p| p.name == "reboot_network_device")
+            .unwrap();
+        let chat_completion_tool: super::ChatCompletionTool = prompt_target.into();
+        assert_eq!(chat_completion_tool.tool_type, ToolType::Function);
+        assert_eq!(chat_completion_tool.function.name, "reboot_network_device");
+        assert_eq!(
+            chat_completion_tool.function.description,
+            "Reboot a specific network device"
+        );
+        assert_eq!(chat_completion_tool.function.parameters.properties.len(), 2);
+        assert_eq!(
+            chat_completion_tool
+                .function
+                .parameters
+                .properties
+                .contains_key("device_id"),
+            true
+        );
+        assert_eq!(
+            chat_completion_tool
+                .function
+                .parameters
+                .properties
+                .get("device_id")
+                .unwrap()
+                .parameter_type,
+            crate::api::open_ai::ParameterType::String
+        );
+        assert_eq!(
+            chat_completion_tool
+                .function
+                .parameters
+                .properties
+                .get("device_id")
+                .unwrap()
+                .description,
+            "Identifier of the network device to reboot.".to_string()
+        );
+        assert_eq!(
+            chat_completion_tool
+                .function
+                .parameters
+                .properties
+                .get("device_id")
+                .unwrap()
+                .required,
+            Some(true)
+        );
+        assert_eq!(
+            chat_completion_tool
+                .function
+                .parameters
+                .properties
+                .get("confirmation")
+                .unwrap()
+                .parameter_type,
+            crate::api::open_ai::ParameterType::Bool
+        );
+    }
 }
--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -1,7 +1,3 @@
-pub const DEFAULT_EMBEDDING_MODEL: &str = "katanemo/bge-large-en-v1.5";
-pub const DEFAULT_INTENT_MODEL: &str = "katanemo/bart-large-mnli";
-pub const DEFAULT_PROMPT_TARGET_THRESHOLD: f64 = 0.8;
-pub const DEFAULT_HALLUCINATED_THRESHOLD: f64 = 0.25;
 pub const RATELIMIT_SELECTOR_HEADER_KEY: &str = "x-arch-ratelimit-selector";
 pub const SYSTEM_ROLE: &str = "system";
 pub const USER_ROLE: &str = "user";
@ -9,11 +5,6 @@ pub const TOOL_ROLE: &str = "tool";
 pub const ASSISTANT_ROLE: &str = "assistant";
 pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes
 pub const MODEL_SERVER_NAME: &str = "model_server";
-pub const ZEROSHOT_INTERNAL_HOST: &str = "zeroshot";
-pub const ARCH_FC_INTERNAL_HOST: &str = "arch_fc";
-pub const HALLUCINATION_INTERNAL_HOST: &str = "hallucination";
-pub const EMBEDDINGS_INTERNAL_HOST: &str = "embeddings";
-pub const GUARD_INTERNAL_HOST: &str = "guard";
 pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
 pub const MESSAGES_KEY: &str = "messages";
 pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
@ -25,7 +16,6 @@ pub const REQUEST_ID_HEADER: &str = "x-request-id";
 pub const TRACE_PARENT_HEADER: &str = "traceparent";
 pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
 pub const ARCH_UPSTREAM_HOST_HEADER: &str = "x-arch-upstream";
-pub const ARCH_LLM_UPSTREAM_LISTENER: &str = "arch_llm_listener";
 pub const ARCH_MODEL_PREFIX: &str = "Arch";
 pub const HALLUCINATION_TEMPLATE: &str =
    "It seems I'm missing some information. Could you provide the following details ";
--- a/crates/common/src/embeddings/create_embedding_request.rs
+++ b/crates/common/src/embeddings/create_embedding_request.rs
@ -1,59 +0,0 @@
-/*
- * OMF Embeddings
- *
- * No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
- *
- * The version of the OpenAPI document: 1.0.0
- *
- * Generated by: https://openapi-generator.tech
- */
-
-use crate::embeddings;
-use serde::{Deserialize, Serialize};
-
-#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
-pub struct CreateEmbeddingRequest {
-    #[serde(rename = "input")]
-    pub input: Box<embeddings::CreateEmbeddingRequestInput>,
-    /// ID of the model to use. You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them.
-    #[serde(rename = "model")]
-    pub model: String,
-    /// The format to return the embeddings in. Can be either `float` or [`base64`](https://pypi.org/project/pybase64/).
-    #[serde(rename = "encoding_format", skip_serializing_if = "Option::is_none")]
-    pub encoding_format: Option<EncodingFormat>,
-    /// The number of dimensions the resulting output embeddings should have. Only supported in `text-embedding-3` and later models.
-    #[serde(rename = "dimensions", skip_serializing_if = "Option::is_none")]
-    pub dimensions: Option<i32>,
-    /// A unique identifier representing your end-user, which can help to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).
-    #[serde(rename = "user", skip_serializing_if = "Option::is_none")]
-    pub user: Option<String>,
-}
-
-impl CreateEmbeddingRequest {
-    pub fn new(
-        input: embeddings::CreateEmbeddingRequestInput,
-        model: String,
-    ) -> CreateEmbeddingRequest {
-        CreateEmbeddingRequest {
-            input: Box::new(input),
-            model,
-            encoding_format: None,
-            dimensions: None,
-            user: None,
-        }
-    }
-}
-/// The format to return the embeddings in. Can be either `float` or [`base64`](https://pypi.org/project/pybase64/).
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
-pub enum EncodingFormat {
-    #[serde(rename = "float")]
-    Float,
-    #[serde(rename = "base64")]
-    Base64,
-}
-
-impl Default for EncodingFormat {
-    fn default() -> EncodingFormat {
-        Self::Float
-    }
-}
--- a/crates/common/src/embeddings/create_embedding_request_input.rs
+++ b/crates/common/src/embeddings/create_embedding_request_input.rs
@ -1,28 +0,0 @@
-/*
- * OMF Embeddings
- *
- * No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
- *
- * The version of the OpenAPI document: 1.0.0
- *
- * Generated by: https://openapi-generator.tech
- */
-
-use serde::{Deserialize, Serialize};
-
-/// CreateEmbeddingRequestInput : Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for `text-embedding-ada-002`), cannot be an empty string, and any array must be 2048 dimensions or less. for counting tokens.
-/// Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for `text-embedding-ada-002`), cannot be an empty string, and any array must be 2048 dimensions or less. for counting tokens.
-#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum CreateEmbeddingRequestInput {
-    /// The string that will be turned into an embedding.
-    String(String),
-    /// The array of integers that will be turned into an embedding.
-    Array(Vec<i32>),
-}
-
-impl Default for CreateEmbeddingRequestInput {
-    fn default() -> Self {
-        Self::String(Default::default())
-    }
-}
--- a/crates/common/src/embeddings/create_embedding_response.rs
+++ b/crates/common/src/embeddings/create_embedding_response.rs
@ -1,55 +0,0 @@
-/*
- * OMF Embeddings
- *
- * No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
- *
- * The version of the OpenAPI document: 1.0.0
- *
- * Generated by: https://openapi-generator.tech
- */
-
-use crate::embeddings;
-use serde::{Deserialize, Serialize};
-
-#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
-pub struct CreateEmbeddingResponse {
-    /// The list of embeddings generated by the model.
-    #[serde(rename = "data")]
-    pub data: Vec<embeddings::Embedding>,
-    /// The name of the model used to generate the embedding.
-    #[serde(rename = "model")]
-    pub model: String,
-    /// The object type, which is always \"list\".
-    #[serde(rename = "object")]
-    pub object: Object,
-    #[serde(rename = "usage")]
-    pub usage: Box<embeddings::CreateEmbeddingResponseUsage>,
-}
-
-impl CreateEmbeddingResponse {
-    pub fn new(
-        data: Vec<embeddings::Embedding>,
-        model: String,
-        object: Object,
-        usage: embeddings::CreateEmbeddingResponseUsage,
-    ) -> CreateEmbeddingResponse {
-        CreateEmbeddingResponse {
-            data,
-            model,
-            object,
-            usage: Box::new(usage),
-        }
-    }
-}
-/// The object type, which is always \"list\".
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
-pub enum Object {
-    #[serde(rename = "list")]
-    List,
-}
-
-impl Default for Object {
-    fn default() -> Object {
-        Self::List
-    }
-}
--- a/crates/common/src/embeddings/create_embedding_response_usage.rs
+++ b/crates/common/src/embeddings/create_embedding_response_usage.rs
@ -1,32 +0,0 @@
-/*
- * OMF Embeddings
- *
- * No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
- *
- * The version of the OpenAPI document: 1.0.0
- *
- * Generated by: https://openapi-generator.tech
- */
-
-use serde::{Deserialize, Serialize};
-
-/// CreateEmbeddingResponseUsage : The usage information for the request.
-#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
-pub struct CreateEmbeddingResponseUsage {
-    /// The number of tokens used by the prompt.
-    #[serde(rename = "prompt_tokens")]
-    pub prompt_tokens: i32,
-    /// The total number of tokens used by the request.
-    #[serde(rename = "total_tokens")]
-    pub total_tokens: i32,
-}
-
-impl CreateEmbeddingResponseUsage {
-    /// The usage information for the request.
-    pub fn new(prompt_tokens: i32, total_tokens: i32) -> CreateEmbeddingResponseUsage {
-        CreateEmbeddingResponseUsage {
-            prompt_tokens,
-            total_tokens,
-        }
-    }
-}
--- a/crates/common/src/embeddings/embedding.rs
+++ b/crates/common/src/embeddings/embedding.rs
@ -1,48 +0,0 @@
-/*
- * OMF Embeddings
- *
- * No description provided (generated by Openapi Generator https://github.com/openapitools/openapi-generator)
- *
- * The version of the OpenAPI document: 1.0.0
- *
- * Generated by: https://openapi-generator.tech
- */
-
-use serde::{Deserialize, Serialize};
-
-/// Embedding : Represents an embedding vector returned by embedding endpoint.
-#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
-pub struct Embedding {
-    /// The index of the embedding in the list of embeddings.
-    #[serde(rename = "index")]
-    pub index: i32,
-    /// The embedding vector, which is a list of floats. The length of vector depends on the model as listed in the [embedding guide](/docs/guides/embeddings).
-    #[serde(rename = "embedding")]
-    pub embedding: Vec<f64>,
-    /// The object type, which is always \"embedding\"
-    #[serde(rename = "object")]
-    pub object: Object,
-}
-
-impl Embedding {
-    /// Represents an embedding vector returned by embedding endpoint.
-    pub fn new(index: i32, embedding: Vec<f64>, object: Object) -> Embedding {
-        Embedding {
-            index,
-            embedding,
-            object,
-        }
-    }
-}
-/// The object type, which is always \"embedding\"
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
-pub enum Object {
-    #[serde(rename = "embedding")]
-    Embedding,
-}
-
-impl Default for Object {
-    fn default() -> Object {
-        Self::Embedding
-    }
-}
--- a/crates/common/src/embeddings/mod.rs
+++ b/crates/common/src/embeddings/mod.rs
@ -1,10 +0,0 @@
-pub mod create_embedding_request;
-pub use self::create_embedding_request::CreateEmbeddingRequest;
-pub mod create_embedding_request_input;
-pub use self::create_embedding_request_input::CreateEmbeddingRequestInput;
-pub mod create_embedding_response;
-pub use self::create_embedding_response::CreateEmbeddingResponse;
-pub mod create_embedding_response_usage;
-pub use self::create_embedding_response_usage::CreateEmbeddingResponseUsage;
-pub mod embedding;
-pub use self::embedding::Embedding;
--- a/crates/common/src/errors.rs
+++ b/crates/common/src/errors.rs
@ -1,6 +1,6 @@
 use proxy_wasm::types::Status;

-use crate::{common_types::open_ai::ChatCompletionChunkResponseError, ratelimit};
+use crate::{api::open_ai::ChatCompletionChunkResponseError, ratelimit};

 #[derive(thiserror::Error, Debug)]
 pub enum ClientError {
--- a/crates/common/src/lib.rs
+++ b/crates/common/src/lib.rs
@ -1,14 +1,13 @@
-pub mod common_types;
+pub mod api;
 pub mod configuration;
 pub mod consts;
-pub mod embeddings;
 pub mod errors;
 pub mod http;
 pub mod llm_providers;
+pub mod path;
 pub mod pii;
 pub mod ratelimit;
 pub mod routing;
 pub mod stats;
 pub mod tokenizer;
 pub mod tracing;
-pub mod path;
--- a/crates/common/src/path.rs
+++ b/crates/common/src/path.rs
@ -1,6 +1,9 @@
 use std::collections::HashMap;

-pub fn replace_params_in_path(path: &str, params: &HashMap<String, String>) -> Result<String, String> {
+pub fn replace_params_in_path(
+    path: &str,
+    params: &HashMap<String, String>,
+) -> Result<String, String> {
    let mut result = String::new();
    let mut in_param = false;
    let mut current_param = String::new();
@ -17,12 +20,10 @@ pub fn replace_params_in_path(path: &str, params: &HashMap<String, String>) -> R
                return Err(format!("Missing value for parameter `{}`", param_name));
            }
            current_param.clear();
+        } else if in_param {
+            current_param.push(c);
        } else {
-            if in_param {
-                current_param.push(c);
-            } else {
-                result.push(c);
-            }
+            result.push(c);
        }
    }

--- a/crates/common/src/stats.rs
+++ b/crates/common/src/stats.rs
@ -2,7 +2,6 @@ use log::error;
 use proxy_wasm::hostcalls;
 use proxy_wasm::types::*;

-#[allow(unused)]
 pub trait Metric {
    fn id(&self) -> u32;
    fn value(&self) -> Result<u64, String> {
@ -14,7 +13,6 @@ pub trait Metric {
    }
 }

-#[allow(unused)]
 pub trait IncrementingMetric: Metric {
    fn increment(&self, offset: i64) {
        match hostcalls::increment_metric(self.id(), offset) {
@ -24,7 +22,6 @@ pub trait IncrementingMetric: Metric {
    }
 }

-#[allow(unused)]
 pub trait RecordingMetric: Metric {
    fn record(&self, value: u64) {
        match hostcalls::record_metric(self.id(), value) {
@ -39,7 +36,6 @@ pub struct Counter {
    id: u32,
 }

-#[allow(unused)]
 impl Counter {
    pub fn new(name: String) -> Counter {
        let returned_id = hostcalls::define_metric(MetricType::Counter, &name)
@ -85,7 +81,6 @@ pub struct Histogram {
    id: u32,
 }

-#[allow(unused)]
 impl Histogram {
    pub fn new(name: String) -> Histogram {
        let returned_id = hostcalls::define_metric(MetricType::Histogram, &name)
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -1,3 +1,4 @@
+use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
 use common::configuration::Configuration;
 use common::consts::OTEL_COLLECTOR_HTTP;
@ -6,9 +7,7 @@ use common::http::CallArgs;
 use common::http::Client;
 use common::llm_providers::LlmProviders;
 use common::ratelimit;
-use common::stats::Counter;
 use common::stats::Gauge;
-use common::stats::Histogram;
 use common::tracing::TraceData;
 use log::debug;
 use log::warn;
@ -22,39 +21,12 @@ use std::time::Duration;

 use std::sync::{Arc, Mutex};

-#[derive(Copy, Clone, Debug)]
-pub struct WasmMetrics {
-    pub active_http_calls: Gauge,
-    pub ratelimited_rq: Counter,
-    pub time_to_first_token: Histogram,
-    pub time_per_output_token: Histogram,
-    pub tokens_per_second: Histogram,
-    pub request_latency: Histogram,
-    pub output_sequence_length: Histogram,
-    pub input_sequence_length: Histogram,
-}
-
-impl WasmMetrics {
-    fn new() -> WasmMetrics {
-        WasmMetrics {
-            active_http_calls: Gauge::new(String::from("active_http_calls")),
-            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
-            time_to_first_token: Histogram::new(String::from("time_to_first_token")),
-            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
-            tokens_per_second: Histogram::new(String::from("tokens_per_second")),
-            request_latency: Histogram::new(String::from("request_latency")),
-            output_sequence_length: Histogram::new(String::from("output_sequence_length")),
-            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
-        }
-    }
-}
-
 #[derive(Debug)]
 pub struct CallContext {}

 #[derive(Debug)]
 pub struct FilterContext {
-    metrics: Rc<WasmMetrics>,
+    metrics: Rc<Metrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, CallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
@ -65,7 +37,7 @@ impl FilterContext {
    pub fn new() -> FilterContext {
        FilterContext {
            callouts: RefCell::new(HashMap::new()),
-            metrics: Rc::new(WasmMetrics::new()),
+            metrics: Rc::new(Metrics::new()),
            llm_providers: None,
            traces_queue: Arc::new(Mutex::new(VecDeque::new())),
        }
--- a/crates/llm_gateway/src/lib.rs
+++ b/crates/llm_gateway/src/lib.rs
@ -3,6 +3,7 @@ use proxy_wasm::traits::*;
 use proxy_wasm::types::*;

 mod filter_context;
+mod metrics;
 mod stream_context;

 proxy_wasm::main! {{
--- a/crates/llm_gateway/src/metrics.rs
+++ b/crates/llm_gateway/src/metrics.rs
@ -0,0 +1,28 @@
+use common::stats::{Counter, Gauge, Histogram};
+
+#[derive(Copy, Clone, Debug)]
+pub struct Metrics {
+    pub active_http_calls: Gauge,
+    pub ratelimited_rq: Counter,
+    pub time_to_first_token: Histogram,
+    pub time_per_output_token: Histogram,
+    pub tokens_per_second: Histogram,
+    pub request_latency: Histogram,
+    pub output_sequence_length: Histogram,
+    pub input_sequence_length: Histogram,
+}
+
+impl Metrics {
+    pub fn new() -> Metrics {
+        Metrics {
+            active_http_calls: Gauge::new(String::from("active_http_calls")),
+            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
+            time_to_first_token: Histogram::new(String::from("time_to_first_token")),
+            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
+            tokens_per_second: Histogram::new(String::from("tokens_per_second")),
+            request_latency: Histogram::new(String::from("request_latency")),
+            output_sequence_length: Histogram::new(String::from("output_sequence_length")),
+            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
+        }
+    }
+}
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -1,5 +1,5 @@
-use crate::filter_context::WasmMetrics;
-use common::common_types::open_ai::{
+use crate::metrics::Metrics;
+use common::api::open_ai::{
    ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, StreamOptions,
 };
@ -12,25 +12,23 @@ use common::errors::ServerError;
 use common::llm_providers::LlmProviders;
 use common::pii::obfuscate_auth_header;
 use common::ratelimit::Header;
+use common::stats::{IncrementingMetric, RecordingMetric};
 use common::tracing::{Event, Span, TraceData, Traceparent};
 use common::{ratelimit, routing, tokenizer};
 use http::StatusCode;
 use log::{debug, trace, warn};
+use proxy_wasm::hostcalls::get_current_time;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
 use std::collections::VecDeque;
 use std::num::NonZero;
 use std::rc::Rc;
 use std::sync::{Arc, Mutex};
-
-use common::stats::{IncrementingMetric, RecordingMetric};
-
-use proxy_wasm::hostcalls::get_current_time;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};

 pub struct StreamContext {
    context_id: u32,
-    metrics: Rc<WasmMetrics>,
+    metrics: Rc<Metrics>,
    ratelimit_selector: Option<Header>,
    streaming_response: bool,
    response_tokens: usize,
@ -50,7 +48,7 @@ pub struct StreamContext {
 impl StreamContext {
    pub fn new(
        context_id: u32,
-        metrics: Rc<WasmMetrics>,
+        metrics: Rc<Metrics>,
        llm_providers: Rc<LlmProviders>,
        traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
    ) -> Self {
--- a/crates/prompt_gateway/src/context.rs
+++ b/crates/prompt_gateway/src/context.rs
@ -1,5 +1,9 @@
+use std::str::FromStr;
+
 use common::errors::ServerError;
 use common::stats::IncrementingMetric;
+use http::StatusCode;
+use log::{debug, warn};
 use proxy_wasm::traits::Context;

 use crate::stream_context::{ResponseHandlerType, StreamContext};
@ -19,76 +23,34 @@ impl Context for StreamContext {
            .expect("invalid token_id");
        self.metrics.active_http_calls.increment(-1);

-        /*
-        state transition
+        let body = self
+            .get_http_call_response_body(0, body_size)
+            .unwrap_or(vec![]);

-        graph LR
-
-        on_http_request_body --> prompt received
-        prompt received --> get embeddings & arch guard
-        arch guard --> get embeddings
-        get embeddings --> zeroshot intent
-
-        ┌──────────────────────┐   ┌─────────────────┐   ┌────────────────┐   ┌─────────────────┐
-        │                      │   │                 │   │                │   │                 │
-        │ on_http_request_body ├──►│ prompt received ├──►│ get embeddings ├──►│ zeroshot intent │
-        │                      │   │                 │   │                │   │                 │
-        └──────────────────────┘   └────────┬────────┘   └────────────────┘   └─────────────────┘
-                                            │                     ▲
-                                            │                     │
-                                            │                     │
-                                            │            ┌────────┴───────┐
-                                            │            │                │
-                                            └───────────►│   arch guard   │
-                                                         │                │
-                                                         └────────────────┘
-
-
-        continue from zeroshot intent
-
-        graph LR
-
-        zeroshot intent --> arch_fc
-        zeroshot intent --> default prompt target
-        arch_fc --> developer api call & hallucination check
-        hallucination check --> parameter gathering & developer api call
-        developer api call --> resume request to llm
-
-
-        ┌─────────────────┐   ┌───────────────────────┐   ┌─────────────────────┐   ┌───────────────────────┐
-        │                 │   │                       │   │                     │   │                       │
-        │ zeroshot intent ├──►│        arch_fc        ├──►│  developer api call ├──►│ resume request to llm │
-        │                 │   │                       │   │                     │   │                       │
-        └────────┬────────┘   └───────────┬───────────┘   └─────────────────────┘   └───────────────────────┘
-               │                        │                          ▲
-               │                        └─────────────┐            │
-               │                                      │            │
-               │            ┌───────────────────────┐ │ ┌──────────┴──────────┐   ┌───────────────────────┐
-               │            │                       │ │ │                     │   │                       │
-               └───────────►│ default prompt target │ └▲│ hallucination check ├──►│  parameter gathering  │
-                            │                       │   │                     │   │                       │
-                            └───────────────────────┘   └─────────────────────┘   └───────────────────────┘
-
-
-        using https://mermaid-ascii.art/
-        */
-
-        if let Some(body) = self.get_http_call_response_body(0, body_size) {
-            #[cfg_attr(any(), rustfmt::skip)]
-            match callout_context.response_handler_type {
-                ResponseHandlerType::ArchGuard => self.arch_guard_handler(body, callout_context),
-                ResponseHandlerType::Embeddings => self.embeddings_handler(body, callout_context),
-                ResponseHandlerType::ZeroShotIntent => self.zero_shot_intent_detection_resp_handler(body, callout_context),
-                ResponseHandlerType::ArchFC => self.arch_fc_response_handler(body, callout_context),
-                ResponseHandlerType::Hallucination => self.hallucination_classification_resp_handler(body, callout_context),
-                ResponseHandlerType::FunctionCall => self.api_call_response_handler(body, callout_context),
-                ResponseHandlerType::DefaultTarget =>self.default_target_handler(body, callout_context),
-            }
-        } else {
-            self.send_server_error(
-                ServerError::LogicError(String::from("No response body in inline HTTP request")),
-                None,
+        let http_status = self
+            .get_http_call_response_header(":status")
+            .unwrap_or(StatusCode::OK.as_str().to_string());
+        debug!("http call response code: {}", http_status);
+        if http_status != StatusCode::OK.as_str() {
+            let server_error = ServerError::Upstream {
+                host: callout_context.upstream_cluster.unwrap(),
+                path: callout_context.upstream_cluster_path.unwrap(),
+                status: http_status.clone(),
+                body: String::from_utf8(body).unwrap(),
+            };
+            warn!("filter received non 2xx code: {:?}", server_error);
+            return self.send_server_error(
+                server_error,
+                Some(StatusCode::from_str(http_status.as_str()).unwrap()),
            );
        }
+
+        debug!("http call response handler type: {:?}", callout_context.response_handler_type);
+        #[cfg_attr(any(), rustfmt::skip)]
+        match callout_context.response_handler_type {
+            ResponseHandlerType::ArchFC => self.arch_fc_response_handler(body, callout_context),
+            ResponseHandlerType::FunctionCall => self.api_call_response_handler(body, callout_context),
+            ResponseHandlerType::DefaultTarget =>self.default_target_handler(body, callout_context),
+        }
    }
 }
--- a/crates/prompt_gateway/src/filter_context.rs
+++ b/crates/prompt_gateway/src/filter_context.rs
@ -1,60 +1,27 @@
+use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
-use common::common_types::EmbeddingType;
 use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget, Tracing};
-use common::consts::ARCH_UPSTREAM_HOST_HEADER;
-use common::consts::DEFAULT_EMBEDDING_MODEL;
-use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, EMBEDDINGS_INTERNAL_HOST};
-use common::embeddings::{
-    CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse,
-};
-use common::http::CallArgs;
 use common::http::Client;
 use common::stats::Gauge;
-use common::stats::IncrementingMetric;
-use http::StatusCode;
-use log::{debug, info, trace, warn};
+use log::debug;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
 use std::cell::RefCell;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::rc::Rc;
-use std::time::Duration;
-
-#[derive(Copy, Clone, Debug)]
-pub struct WasmMetrics {
-    pub active_http_calls: Gauge,
-}
-
-impl WasmMetrics {
-    fn new() -> WasmMetrics {
-        WasmMetrics {
-            active_http_calls: Gauge::new(String::from("active_http_calls")),
-        }
-    }
-}
-
-pub type EmbeddingTypeMap = HashMap<EmbeddingType, Vec<f64>>;
-pub type EmbeddingsStore = HashMap<String, EmbeddingTypeMap>;

 #[derive(Debug)]
-pub struct FilterCallContext {
-    pub prompt_target_name: String,
-    pub embedding_type: EmbeddingType,
-}
+pub struct FilterCallContext {}

 #[derive(Debug)]
 pub struct FilterContext {
-    metrics: Rc<WasmMetrics>,
+    metrics: Rc<Metrics>,
    // callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
    callouts: RefCell<HashMap<u32, FilterCallContext>>,
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
    prompt_guards: Rc<PromptGuards>,
-    embeddings_store: Option<Rc<EmbeddingsStore>>,
-    temp_embeddings_store: EmbeddingsStore,
-    active_embedding_calls_count: u32,
    tracing: Rc<Option<Tracing>>,
 }

@ -62,136 +29,14 @@ impl FilterContext {
    pub fn new() -> FilterContext {
        FilterContext {
            callouts: RefCell::new(HashMap::new()),
-            metrics: Rc::new(WasmMetrics::new()),
+            metrics: Rc::new(Metrics::new()),
            system_prompt: Rc::new(None),
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
-            embeddings_store: Some(Rc::new(HashMap::new())),
-            temp_embeddings_store: HashMap::new(),
-            active_embedding_calls_count: 0,
            tracing: Rc::new(None),
        }
    }
-
-    fn process_prompt_targets(&mut self) {
-        let prompt_target_description: Vec<(String, String)> = self
-            .prompt_targets
-            .iter()
-            .map(|(k, v)| (k.clone(), v.description.clone()))
-            .collect();
-
-        prompt_target_description
-            .iter()
-            .for_each(|(name, description)| {
-                self.schedule_embeddings_call(name, description, EmbeddingType::Description);
-            });
-    }
-
-    fn schedule_embeddings_call(
-        &mut self,
-        prompt_target_name: &str,
-        input: &str,
-        embedding_type: EmbeddingType,
-    ) {
-        let embeddings_input = CreateEmbeddingRequest {
-            input: Box::new(CreateEmbeddingRequestInput::String(String::from(input))),
-            model: String::from(DEFAULT_EMBEDDING_MODEL),
-            encoding_format: None,
-            dimensions: None,
-            user: None,
-        };
-        let json_data = serde_json::to_string(&embeddings_input).unwrap();
-
-        let call_args = CallArgs::new(
-            ARCH_INTERNAL_CLUSTER_NAME,
-            "/embeddings",
-            vec![
-                (ARCH_UPSTREAM_HOST_HEADER, EMBEDDINGS_INTERNAL_HOST),
-                (":method", "POST"),
-                (":path", "/embeddings"),
-                (":authority", EMBEDDINGS_INTERNAL_HOST),
-                ("content-type", "application/json"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
-            ],
-            Some(json_data.as_bytes()),
-            vec![],
-            Duration::from_secs(60),
-        );
-
-        let call_context = crate::filter_context::FilterCallContext {
-            prompt_target_name: String::from(prompt_target_name),
-            embedding_type,
-        };
-
-        self.active_embedding_calls_count += 1;
-        if let Err(error) = self.http_call(call_args, call_context) {
-            panic!("{error}")
-        }
-    }
-
-    fn embedding_response_handler(
-        &mut self,
-        embedding_type: EmbeddingType,
-        prompt_target_name: String,
-        body: Vec<u8>,
-    ) {
-        let prompt_target = self
-            .prompt_targets
-            .get(&prompt_target_name)
-            .unwrap_or_else(|| {
-                panic!(
-                    "Received embeddings response for unknown prompt target name={}",
-                    prompt_target_name
-                )
-            });
-
-        if !body.is_empty() {
-            let mut embedding_response: CreateEmbeddingResponse =
-                match serde_json::from_slice(&body) {
-                    Ok(response) => response,
-                    Err(e) => {
-                        panic!(
-                            "Error deserializing embedding response. body: {:?}: {:?}",
-                            String::from_utf8(body).unwrap(),
-                            e
-                        );
-                    }
-                };
-
-            let embeddings = embedding_response.data.remove(0).embedding;
-            debug!(
-                    "Adding embeddings for prompt target name: {:?}, description: {:?}, embedding type: {:?}",
-                    prompt_target.name,
-                    prompt_target.description,
-                    embedding_type
-                );
-
-            let entry = self.temp_embeddings_store.entry(prompt_target_name);
-            match entry {
-                Entry::Occupied(_) => {
-                    entry.and_modify(|e| {
-                        if let Entry::Vacant(e) = e.entry(embedding_type) {
-                            e.insert(embeddings);
-                        } else {
-                            panic!(
-                                "Duplicate {:?} for prompt target with name=\"{}\"",
-                                &embedding_type, prompt_target.name
-                            )
-                        }
-                    });
-                }
-                Entry::Vacant(_) => {
-                    entry.or_insert(HashMap::from([(embedding_type, embeddings)]));
-                }
-            }
-
-            if self.prompt_targets.len() == self.temp_embeddings_store.len() {
-                self.embeddings_store =
-                    Some(Rc::new(std::mem::take(&mut self.temp_embeddings_store)))
-            }
-        }
-    }
 }

 impl Client for FilterContext {
@ -206,46 +51,7 @@ impl Client for FilterContext {
    }
 }

-impl Context for FilterContext {
-    fn on_http_call_response(
-        &mut self,
-        token_id: u32,
-        _num_headers: usize,
-        body_size: usize,
-        _num_trailers: usize,
-    ) {
-        trace!(
-            "filter_context: on_http_call_response called with token_id: {:?}",
-            token_id
-        );
-        let callout_data = self
-            .callouts
-            .borrow_mut()
-            .remove(&token_id)
-            .expect("invalid token_id");
-
-        self.active_embedding_calls_count -= 1;
-        self.metrics.active_http_calls.increment(-1);
-        let body_bytes = self.get_http_call_response_body(0, body_size).unwrap();
-
-        if let Some(status_code) = self.get_http_call_response_header(":status") {
-            if status_code == StatusCode::OK.as_str() {
-                self.embedding_response_handler(
-                    callout_data.embedding_type,
-                    callout_data.prompt_target_name,
-                    body_bytes,
-                );
-            } else {
-                warn!(
-                    "Received non-200 status code: {} for callout with token_id: {}: body_str: {}",
-                    status_code,
-                    token_id,
-                    String::from_utf8(body_bytes).unwrap()
-                );
-            }
-        }
-    }
-}
+impl Context for FilterContext {}

 // RootContext allows the Rust code to reach into the Envoy Config
 impl RootContext for FilterContext {
@ -283,15 +89,12 @@ impl RootContext for FilterContext {
            context_id
        );

-        let embedding_store = self.embeddings_store.as_ref().map(Rc::clone);
        Some(Box::new(StreamContext::new(
            context_id,
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
-            Rc::clone(&self.prompt_guards),
            Rc::clone(&self.overrides),
-            embedding_store,
            Rc::clone(&self.tracing),
        )))
    }
@ -301,25 +104,6 @@ impl RootContext for FilterContext {
    }

    fn on_vm_start(&mut self, _: usize) -> bool {
-        self.set_tick_period(Duration::from_secs(1));
        true
    }
-
-    fn on_tick(&mut self) {
-        if self.embeddings_store.is_some()
-            && self.embeddings_store.as_ref().unwrap().len() == self.prompt_targets.len()
-        {
-            info!("embeddings store initialized");
-            self.set_tick_period(Duration::from_secs(0));
-        } else {
-            if self.active_embedding_calls_count == 0 {
-                info!("retrieving embeddings from embedding server");
-                self.process_prompt_targets();
-            } else {
-                info!("waiting for embeddings store to be initialized");
-            }
-
-            self.set_tick_period(Duration::from_secs(5));
-        }
-    }
 }
--- a/crates/prompt_gateway/src/http_context.rs
+++ b/crates/prompt_gateway/src/http_context.rs
@ -1,15 +1,12 @@
 use crate::stream_context::{ResponseHandlerType, StreamCallContext, StreamContext};
 use common::{
-    common_types::{
-        open_ai::{
-            to_server_events, ArchState, ChatCompletionStreamResponse, ChatCompletionsRequest,
-        },
-        PromptGuardRequest, PromptGuardTask,
+    api::open_ai::{
+        self, ArchState, ChatCompletionStreamResponse, ChatCompletionTool, ChatCompletionsRequest,
    },
    consts::{
        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
-        ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, GUARD_INTERNAL_HOST,
-        HEALTHZ_PATH, REQUEST_ID_HEADER, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
+        ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
+        MODEL_SERVER_NAME, REQUEST_ID_HEADER, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
    },
    errors::ServerError,
    http::{CallArgs, Client},
@ -37,11 +34,7 @@ impl HttpContext for StreamContext {

        let request_path = self.get_http_request_header(":path").unwrap_or_default();
        if request_path == HEALTHZ_PATH {
-            if self.is_embedding_store_initialized() {
-                self.send_http_response(200, vec![], None);
-            } else {
-                self.send_http_response(503, vec![], None);
-            }
+            self.send_http_response(200, vec![], None);
            return Action::Continue;
        }

@ -140,43 +133,25 @@ impl HttpContext for StreamContext {

        self.user_prompt = Some(last_user_prompt.clone());

-        let user_message_str = self.user_prompt.as_ref().unwrap().content.clone();
+        // convert prompt targets to ChatCompletionTool
+        let tool_calls: Vec<ChatCompletionTool> = self
+            .prompt_targets
+            .iter()
+            .map(|(_, pt)| pt.into())
+            .collect();

-        let prompt_guard_jailbreak_task = self
-            .prompt_guards
-            .input_guards
-            .contains_key(&common::configuration::GuardType::Jailbreak);
+        let arch_fc_chat_completion_request = ChatCompletionsRequest {
+            messages: deserialized_body.messages.clone(),
+            metadata: deserialized_body.metadata.clone(),
+            stream: deserialized_body.stream,
+            model: "--".to_string(),
+            stream_options: deserialized_body.stream_options.clone(),
+            tools: Some(tool_calls),
+        };

        self.chat_completions_request = Some(deserialized_body);

-        if !prompt_guard_jailbreak_task {
-            debug!("Missing input guard. Making inline call to retrieve embeddings");
-            let callout_context = StreamCallContext {
-                response_handler_type: ResponseHandlerType::ArchGuard,
-                user_message: user_message_str.clone(),
-                prompt_target_name: None,
-                request_body: self.chat_completions_request.as_ref().unwrap().clone(),
-                similarity_scores: None,
-                upstream_cluster: None,
-                upstream_cluster_path: None,
-            };
-            self.get_embeddings(callout_context);
-            return Action::Pause;
-        }
-
-        let get_prompt_guards_request = PromptGuardRequest {
-            input: self
-                .user_prompt
-                .as_ref()
-                .unwrap()
-                .content
-                .as_ref()
-                .unwrap()
-                .clone(),
-            task: PromptGuardTask::Jailbreak,
-        };
-
-        let json_data: String = match serde_json::to_string(&get_prompt_guards_request) {
+        let json_data = match serde_json::to_string(&arch_fc_chat_completion_request) {
            Ok(json_data) => json_data,
            Err(error) => {
                self.send_server_error(ServerError::Serialization(error), None);
@ -184,14 +159,14 @@ impl HttpContext for StreamContext {
            }
        };

+        debug!("archgw => archfc: {}", json_data);
+
        let mut headers = vec![
-            (ARCH_UPSTREAM_HOST_HEADER, GUARD_INTERNAL_HOST),
+            (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
            (":method", "POST"),
-            (":path", "/guard"),
-            (":authority", GUARD_INTERNAL_HOST),
+            (":path", "/function_calling"),
            ("content-type", "application/json"),
-            ("x-envoy-max-retries", "3"),
-            ("x-envoy-upstream-rq-timeout-ms", "60000"),
+            (":authority", MODEL_SERVER_NAME),
        ];

        if self.request_id.is_some() {
@ -204,23 +179,25 @@ impl HttpContext for StreamContext {

        let call_args = CallArgs::new(
            ARCH_INTERNAL_CLUSTER_NAME,
-            "/guard",
+            "/function_calling",
            headers,
            Some(json_data.as_bytes()),
            vec![],
            Duration::from_secs(5),
        );
+
        let call_context = StreamCallContext {
-            response_handler_type: ResponseHandlerType::ArchGuard,
+            response_handler_type: ResponseHandlerType::ArchFC,
            user_message: self.user_prompt.as_ref().unwrap().content.clone(),
            prompt_target_name: None,
            request_body: self.chat_completions_request.as_ref().unwrap().clone(),
            similarity_scores: None,
-            upstream_cluster: None,
-            upstream_cluster_path: None,
+            upstream_cluster: Some(ARCH_INTERNAL_CLUSTER_NAME.to_string()),
+            upstream_cluster_path: Some("/function_calling".to_string()),
        };

        if let Err(e) = self.http_call(call_args, call_context) {
+            debug!("http_call failed: {:?}", e);
            self.send_server_error(ServerError::HttpDispatch(e), None);
        }

@ -324,7 +301,7 @@ impl HttpContext for StreamContext {
                    ),
                ];

-                let mut response_str = to_server_events(chunks);
+                let mut response_str = open_ai::to_server_events(chunks);
                // append the original response from the model to the stream
                response_str.push_str(&body_utf8);
                self.set_http_response_body(0, body_size, response_str.as_bytes());
@ -339,9 +316,11 @@ impl HttpContext for StreamContext {
                let mut data = match serde_json::from_str(&body_utf8) {
                    Ok(data) => data,
                    Err(e) => {
-                        warn!("could not deserialize response: {}", e);
-                        self.send_server_error(ServerError::Deserialization(e), None);
-                        return Action::Pause;
+                        warn!(
+                            "could not deserialize response, sending data as it is: {}",
+                            e
+                        );
+                        return Action::Continue;
                    }
                };
                // use serde::Value to manipulate the json object and ensure that we don't lose any data
--- a/crates/prompt_gateway/src/lib.rs
+++ b/crates/prompt_gateway/src/lib.rs
@ -4,8 +4,8 @@ use proxy_wasm::types::*;

 mod context;
 mod filter_context;
-mod hallucination;
 mod http_context;
+mod metrics;
 mod stream_context;

 proxy_wasm::main! {{
--- a/crates/prompt_gateway/src/metrics.rs
+++ b/crates/prompt_gateway/src/metrics.rs
@ -0,0 +1,14 @@
+use common::stats::Gauge;
+
+#[derive(Copy, Clone, Debug)]
+pub struct Metrics {
+    pub active_http_calls: Gauge,
+}
+
+impl Metrics {
+    pub fn new() -> Metrics {
+        Metrics {
+            active_http_calls: Gauge::new(String::from("active_http_calls")),
+        }
+    }
+}
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -1,11 +1,7 @@
-use common::common_types::open_ai::{ChatCompletionsResponse, Choice, Message, Usage};
-use common::common_types::open_ai::{FunctionCallDetail, ToolCall, ToolType};
-use common::common_types::{HallucinationClassificationResponse, PromptGuardResponse};
-use common::embeddings::{
-    create_embedding_response, embedding, CreateEmbeddingResponse, CreateEmbeddingResponseUsage,
-    Embedding,
+use common::api::open_ai::{
+    ChatCompletionsResponse, Choice, FunctionCallDetail, Message, ToolCall, ToolType, Usage,
 };
-use common::{common_types::ZeroShotClassificationResponse, configuration::Configuration};
+use common::configuration::Configuration;
 use http::StatusCode;
 use proxy_wasm_test_framework::tester::{self, Tester};
 use proxy_wasm_test_framework::types::{
@ -80,13 +76,11 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
-                ("x-arch-upstream", "guard"),
+                ("x-arch-upstream", "model_server"),
                (":method", "POST"),
-                (":path", "/guard"),
-                (":authority", "guard"),
+                (":path", "/function_calling"),
                ("content-type", "application/json"),
-                ("x-envoy-max-retries", "3"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
+                (":authority", "model_server"),
            ]),
            None,
            None,
@ -94,139 +88,11 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
        )
        .returning(Some(1))
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Trace), None)
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
-
-    let prompt_guard_response = PromptGuardResponse {
-        toxic_prob: None,
-        toxic_verdict: None,
-        jailbreak_prob: None,
-        jailbreak_verdict: None,
-    };
-    let prompt_guard_response_buffer = serde_json::to_string(&prompt_guard_response).unwrap();
-    module
-        .call_proxy_on_http_call_response(
-            http_context,
-            1,
-            0,
-            prompt_guard_response_buffer.len() as i32,
-            0,
-        )
-        .expect_metric_increment("active_http_calls", -1)
-        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
-        .returning(Some(&prompt_guard_response_buffer))
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_http_call(
-            Some("arch_internal"),
-            Some(vec![
-                ("x-arch-upstream", "embeddings"),
-                (":method", "POST"),
-                (":path", "/embeddings"),
-                (":authority", "embeddings"),
-                ("content-type", "application/json"),
-                ("x-envoy-max-retries", "3"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
-            ]),
-            None,
-            None,
-            None,
-        )
-        .returning(Some(2))
-        .expect_metric_increment("active_http_calls", 1)
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
-
-    let embedding_response = CreateEmbeddingResponse {
-        data: vec![Embedding {
-            index: 0,
-            embedding: vec![],
-            object: embedding::Object::default(),
-        }],
-        model: String::from("test"),
-        object: create_embedding_response::Object::default(),
-        usage: Box::new(CreateEmbeddingResponseUsage::new(0, 0)),
-    };
-    let embeddings_response_buffer = serde_json::to_string(&embedding_response).unwrap();
-    module
-        .call_proxy_on_http_call_response(
-            http_context,
-            2,
-            0,
-            embeddings_response_buffer.len() as i32,
-            0,
-        )
-        .expect_metric_increment("active_http_calls", -1)
-        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
-        .returning(Some(&embeddings_response_buffer))
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Warn), None)
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_http_call(
-            Some("arch_internal"),
-            Some(vec![
-                ("x-arch-upstream", "zeroshot"),
-                (":method", "POST"),
-                (":path", "/zeroshot"),
-                (":authority", "zeroshot"),
-                ("content-type", "application/json"),
-                ("x-envoy-max-retries", "3"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
-            ]),
-            None,
-            None,
-            None,
-        )
-        .returning(Some(3))
-        .expect_metric_increment("active_http_calls", 1)
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
-
-    let zero_shot_response = ZeroShotClassificationResponse {
-        predicted_class: "weather_forecast".to_string(),
-        predicted_class_score: 0.1,
-        scores: HashMap::new(),
-        model: "test-model".to_string(),
-    };
-    let zeroshot_intent_detection_buffer = serde_json::to_string(&zero_shot_response).unwrap();
-    module
-        .call_proxy_on_http_call_response(
-            http_context,
-            3,
-            0,
-            zeroshot_intent_detection_buffer.len() as i32,
-            0,
-        )
-        .expect_metric_increment("active_http_calls", -1)
-        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
-        .returning(Some(&zeroshot_intent_detection_buffer))
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_http_call(
-            Some("arch_internal"),
-            Some(vec![
-                (":method", "POST"),
-                ("x-arch-upstream", "arch_fc"),
-                (":path", "/v1/chat/completions"),
-                (":authority", "arch_fc"),
-                ("content-type", "application/json"),
-                ("x-envoy-max-retries", "3"),
-                ("x-envoy-upstream-rq-timeout-ms", "120000"),
-            ]),
-            None,
-            None,
-            None,
-        )
-        .returning(Some(4))
-        .expect_metric_increment("active_http_calls", 1)
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
 }

 fn setup_filter(module: &mut Tester, config: &str) -> i32 {
@ -245,69 +111,6 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 {
        .execute_and_expect(ReturnType::Bool(true))
        .unwrap();

-    module
-        .call_proxy_on_tick(filter_context)
-        .expect_log(Some(LogLevel::Info), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_http_call(
-            Some("arch_internal"),
-            Some(vec![
-                ("x-arch-upstream", "embeddings"),
-                (":method", "POST"),
-                (":path", "/embeddings"),
-                (":authority", "embeddings"),
-                ("content-type", "application/json"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
-            ]),
-            None,
-            None,
-            None,
-        )
-        .returning(Some(101))
-        .expect_metric_increment("active_http_calls", 1)
-        .expect_set_tick_period_millis(Some(5000))
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
-
-    let embedding_response = CreateEmbeddingResponse {
-        data: vec![Embedding {
-            embedding: vec![],
-            index: 0,
-            object: embedding::Object::default(),
-        }],
-        model: String::from("test"),
-        object: create_embedding_response::Object::default(),
-        usage: Box::new(CreateEmbeddingResponseUsage {
-            prompt_tokens: 0,
-            total_tokens: 0,
-        }),
-    };
-    let embedding_response_str = serde_json::to_string(&embedding_response).unwrap();
-    module
-        .call_proxy_on_http_call_response(
-            filter_context,
-            101,
-            0,
-            embedding_response_str.len() as i32,
-            0,
-        )
-        .expect_log(
-            Some(LogLevel::Trace),
-            Some(
-                format!(
-                    "filter_context: on_http_call_response called with token_id: {:?}",
-                    101
-                )
-                .as_str(),
-            ),
-        )
-        .expect_metric_increment("active_http_calls", -1)
-        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
-        .returning(Some(&embedding_response_str))
-        .expect_log(Some(LogLevel::Debug), None)
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
-
    filter_context
 }

@ -432,6 +235,7 @@ fn prompt_gateway_successful_request_to_open_ai_chat_completions() {
        .returning(Some(chat_completions_request_body))
        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Trace), None)
        .expect_http_call(Some("arch_internal"), None, None, None, None)
        .returning(Some(4))
@ -535,8 +339,8 @@ fn prompt_gateway_request_to_llm_gateway() {
            completion_tokens: 0,
        }),
        choices: vec![Choice {
-            finish_reason: "test".to_string(),
-            index: 0,
+            finish_reason: Some("test".to_string()),
+            index: Some(0),
            message: Message {
                role: "system".to_string(),
                content: None,
@ -561,7 +365,7 @@ fn prompt_gateway_request_to_llm_gateway() {

    let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
    module
-        .call_proxy_on_http_call_response(http_context, 4, 0, arch_fc_resp_str.len() as i32, 0)
+        .call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&arch_fc_resp_str))
@ -569,47 +373,7 @@ fn prompt_gateway_request_to_llm_gateway() {
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Trace), None)
-        .expect_http_call(
-            Some("arch_internal"),
-            Some(vec![
-                ("x-arch-upstream", "hallucination"),
-                (":method", "POST"),
-                (":path", "/hallucination"),
-                (":authority", "hallucination"),
-                ("content-type", "application/json"),
-                ("x-envoy-max-retries", "3"),
-                ("x-envoy-upstream-rq-timeout-ms", "60000"),
-            ]),
-            None,
-            None,
-            None,
-        )
-        .returning(Some(5))
-        .expect_metric_increment("active_http_calls", 1)
-        .execute_and_expect(ReturnType::None)
-        .unwrap();
-
-    // hallucination should return that parameters were not halliucinated
-    //     prompt: str
-    // parameters: dict
-    // model: str
-
-    let hallucatination_body = HallucinationClassificationResponse {
-        params_scores: HashMap::from([("city".to_string(), 0.99)]),
-        model: "nli-model".to_string(),
-    };
-
-    let body_text = serde_json::to_string(&hallucatination_body).unwrap();
-
-    module
-        .call_proxy_on_http_call_response(http_context, 5, 0, body_text.len() as i32, 0)
-        .expect_metric_increment("active_http_calls", -1)
-        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
-        .returning(Some(&body_text))
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Trace), None)
        .expect_http_call(
            Some("arch_internal"),
@ -625,14 +389,14 @@ fn prompt_gateway_request_to_llm_gateway() {
            None,
            None,
        )
-        .returning(Some(6))
+        .returning(Some(2))
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::None)
        .unwrap();

    let body_text = String::from("test body");
    module
-        .call_proxy_on_http_call_response(http_context, 6, 0, body_text.len() as i32, 0)
+        .call_proxy_on_http_call_response(http_context, 2, 0, body_text.len() as i32, 0)
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
@ -640,6 +404,10 @@ fn prompt_gateway_request_to_llm_gateway() {
        .expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
        .returning(Some("200"))
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();
@ -649,8 +417,8 @@ fn prompt_gateway_request_to_llm_gateway() {
            completion_tokens: 0,
        }),
        choices: vec![Choice {
-            finish_reason: "test".to_string(),
-            index: 0,
+            finish_reason: Some("test".to_string()),
+            index: Some(0),
            message: Message {
                role: "assistant".to_string(),
                content: Some("hello from fake llm gateway".to_string()),
--- a/demos/currency_exchange/run_demo.sh
+++ b/demos/currency_exchange/run_demo.sh
@ -22,7 +22,7 @@ start_demo() {
  echo "Starting Arch with arch_config.yaml..."
  archgw up arch_config.yaml

-  # Step 4: Start Network Agent
+  # Step 4: Start developer services
  echo "Starting Network Agent using Docker Compose..."
  docker compose up -d  # Run in detached mode
 }
--- a/demos/currency_exchange/test_data.yaml
+++ b/demos/currency_exchange/test_data.yaml
@ -0,0 +1,13 @@
+test_cases:
+  - id: "get exchange rate"
+    input:
+      messages:
+        - role: user
+          content: what is exchange rate for gbp
+    expected_tools:
+      - type: function
+        function:
+          name: currency_exchange
+          arguments:
+            currency_symbol: GBP
+    expected_output_contains: gbp
--- a/demos/hr_agent/docker-compose.yaml
+++ b/demos/hr_agent/docker-compose.yaml
@ -15,3 +15,16 @@ services:
        test: ["CMD", "curl" ,"http://localhost:80/healthz"]
        interval: 5s
        retries: 20
+
+  chatbot_ui:
+    build:
+      context: ../shared/chatbot_ui
+      dockerfile: Dockerfile
+    ports:
+      - "18080:8080"
+    environment:
+      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./arch_config.yaml:/app/arch_config.yaml
--- a/demos/hr_agent/main.py
+++ b/demos/hr_agent/main.py
@ -11,14 +11,9 @@ from typing import List, Optional, Tuple
 from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
 from openai import OpenAI
-from common import create_gradio_app

 app = FastAPI()
 workforce_data_df = None
-demo_description = """This demo showcases how the **Arch** can be used to build an
-HR agent to manage workforce-related inquiries, workforce planning, and communication via Slack.
-It intelligently routes incoming prompts to the correct targets, providing concise and useful responses
-tailored for HR and workforce decision-making. """

 with open("workforce_data.json") as file:
    workforce_data = json.load(file)
@ -95,15 +90,5 @@ def get_workforce(request: WorkforceRequest):
    return response


-CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
-client = OpenAI(
-    api_key="--",
-    base_url=CHAT_COMPLETION_ENDPOINT,
-)
-
-gr.mount_gradio_app(
-    app, create_gradio_app(demo_description, client), path="/agent/chat"
-)
-
 if __name__ == "__main__":
    app.run(debug=True)
--- a/demos/hr_agent/test_data.yaml
+++ b/demos/hr_agent/test_data.yaml
@ -0,0 +1,14 @@
+test_cases:
+  - id: get workforce data
+    input:
+      messages:
+        - role: user
+          content: what is workforce data for asia for fte employees
+    expected_tools:
+      - type: function
+        function:
+          name: workforce
+          arguments:
+            staffing_type: fte
+            region: asia
+    expected_output_contains: asia
--- a/demos/insurance_agent/arch_config.yaml
+++ b/demos/insurance_agent/arch_config.yaml
@ -34,7 +34,7 @@ prompt_targets:
    default: true

  - name: get_policy_coverage
-    description: Retrieve the coverage details for a given policy type (car, boat, house, motorcycle).
+    description: Retrieve the coverage details for an insurance policy.
    endpoint:
      name: app_server
      path: /policy/coverage
@ -42,7 +42,7 @@ prompt_targets:
    parameters:
    - name: policy_type
      type: str
-      description: The type of policy, option - car, boat, house, motorcycle.
+      description: The type of policy
      default: car
      required: true

@ -51,11 +51,11 @@ prompt_targets:
      name: app_server
      path: /policy/initiate
      http_method: POST
-    description: Start a policy coverage for car, boat, motorcycle or house.
+    description: Start a policy coverage for an insurance policy
    parameters:
    - name: policy_type
      type: str
-      description: The type of policy, option - car, boat, house, motorcycle.
+      description: The type of policy
      default: car
      required: true
    - name: deductible
@ -84,11 +84,11 @@ prompt_targets:
      name: app_server
      path: /policy/deductible
      http_method: POST
-    description: Update the deductible amount for a specific policy coverage.
+    description: Update the deductible amount for a specific insurance policy coverage.
    parameters:
    - name: policy_id
      type: str
-      description: The id of the policy
+      description: The id of the insurance policy
      required: true
    - name: deductible
      type: float
--- a/demos/multi_turn_rag_agent/Dockerfile
+++ b/demos/multi_turn_rag_agent/Dockerfile
@ -0,0 +1,19 @@
+FROM python:3.12 AS base
+
+FROM base AS builder
+
+WORKDIR /src
+
+COPY requirements.txt /src/
+RUN pip install --prefix=/runtime --force-reinstall -r requirements.txt
+
+COPY . /src
+
+FROM python:3.12-slim AS output
+
+COPY --from=builder /runtime /usr/local
+
+COPY . /app
+WORKDIR /app
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--log-level", "info"]
--- a/demos/multi_turn_rag_agent/README.md
+++ b/demos/multi_turn_rag_agent/README.md
@ -0,0 +1,22 @@
+# Multi-Turn Agentic Demo (RAG)
+
+This demo showcases how the **Arch** can be used to build accurate multi-turn RAG agent by just writing simple APIs.
+
+![Example of Multi-turn Interaction](mutli-turn-example.png)
+
+### Energy Source Q/A
+Provides information about various energy sources and considerations.
+
+- **Endpoint**: `/agent/energy_source`
+- **Parameters**:
+  - `energy_source` (`str`, **required**): A source of energy (e.g., `renewable`, `fossil`).
+  - `consideration` (`str`, *optional*): A specific type of consideration for an energy source (e.g., `cost`, `economic`, `technology`).
+
+# Starting the demo
+1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
+2. Start Arch
+   ```sh
+   sh run_demo.sh
+   ```
+3. Navigate to http://localhost:18080
+4. Ask "give me information about renewable energy sources"
--- a/demos/multi_turn_rag_agent/arch_config.yaml
+++ b/demos/multi_turn_rag_agent/arch_config.yaml
@ -0,0 +1,59 @@
+version: v0.1
+
+listener:
+  address: 127.0.0.1
+  port: 10000
+  message_format: huggingface
+  connect_timeout: 0.005s
+
+endpoints:
+  rag_energy_source_agent:
+    endpoint: host.docker.internal:18083
+    connect_timeout: 0.005s
+
+llm_providers:
+  - name: gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    provider: openai
+    model: gpt-4o-mini
+    default: true
+
+system_prompt: |
+  You are a helpful assistant and can offer information about energy sources.
+  You will get a JSON object with energy_source and consideration fields. Focus on answering the querstion using those fields.
+  Keep your responses to just three main points to make it easy for the reader to digest the information
+
+prompt_targets:
+  - name: get_info_for_energy_source
+    description: get information about an energy source
+    parameters:
+      - name: energy_source
+        type: str
+        description: a source of energy
+        required: true
+        enum: [renewable, fossil]
+      - name: consideration
+        type: str
+        description: a specific type of consideration for an energy source
+        enum: [cost, economic, technology]
+    endpoint:
+      name: rag_energy_source_agent
+      path: /agent/energy_source_info
+      http_method: POST
+
+  - name: default_target
+    default: true
+    description: This is the default target for all unmatched prompts.
+    endpoint:
+      name: rag_energy_source_agent
+      path: /default_target
+      http_method: POST
+    system_prompt: |
+      You are a helpful assistant! Summarize the user's request and provide a helpful response.
+    # if it is set to false arch will send response that it received from this prompt target to the user
+    # if true arch will forward the response to the default LLM
+    auto_llm_dispatch_on_response: false
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
--- a/demos/multi_turn_rag_agent/docker-compose.yaml
+++ b/demos/multi_turn_rag_agent/docker-compose.yaml
@ -0,0 +1,24 @@
+services:
+  rag_energy_source_agent:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "18083:80"
+    healthcheck:
+        test: ["CMD", "curl" ,"http://localhost:80/healthz"]
+        interval: 5s
+        retries: 20
+
+  chatbot_ui:
+    build:
+      context: ../shared/chatbot_ui
+      dockerfile: Dockerfile
+    ports:
+      - "18080:8080"
+    environment:
+      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./arch_config.yaml:/app/arch_config.yaml
--- a/demos/multi_turn_rag_agent/main.py
+++ b/demos/multi_turn_rag_agent/main.py
@ -0,0 +1,42 @@
+import os
+import gradio as gr
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from openai import OpenAI
+
+app = FastAPI()
+
+
+# Define the request model
+class EnergySourceRequest(BaseModel):
+    energy_source: str
+    consideration: Optional[str] = None
+
+
+class EnergySourceResponse(BaseModel):
+    energy_source: str
+    consideration: Optional[str] = None
+
+
+# Post method for device summary
+@app.post("/agent/energy_source_info")
+def get_workforce(request: EnergySourceRequest):
+    """
+    Endpoint to get details about energy source
+    """
+    considertion = "You don't have any specific consideration. Feel free to talk in a more open ended fashion"
+
+    if request.consideration is not None:
+        considertion = f"Add specific focus on the following consideration when you summarize the content for the energy source: {request.consideration}"
+
+    response = {
+        "energy_source": request.energy_source,
+        "consideration": considertion,
+    }
+    return response
+
+
+if __name__ == "__main__":
+    app.run(debug=True)
--- a/demos/multi_turn_rag_agent/mutli-turn-example.png
+++ b/demos/multi_turn_rag_agent/mutli-turn-example.png
--- a/demos/multi_turn_rag_agent/requirements.txt
+++ b/demos/multi_turn_rag_agent/requirements.txt
@ -0,0 +1,12 @@
+fastapi
+uvicorn
+typing
+pandas
+gradio==5.3.0
+async_timeout==4.0.3
+loguru==0.7.2
+asyncio==3.4.3
+httpx==0.27.0
+python-dotenv==1.0.1
+pydantic==2.8.2
+openai==1.51.0
--- a/demos/multi_turn_rag_agent/run_demo.sh
+++ b/demos/multi_turn_rag_agent/run_demo.sh
@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+
+# Function to start the demo
+start_demo() {
+  # Step 1: Check if .env file exists
+  if [ -f ".env" ]; then
+    echo ".env file already exists. Skipping creation."
+  else
+    # Step 2: Create `.env` file and set OpenAI key
+    if [ -z "$OPENAI_API_KEY" ]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    echo ".env file created with OPENAI_API_KEY."
+  fi
+
+  # Step 3: Start Arch
+  echo "Starting Arch with arch_config.yaml..."
+  archgw up arch_config.yaml
+
+  # Step 4: Start Network Agent
+  echo "Starting HR Agent using Docker Compose..."
+  docker compose up -d  # Run in detached mode
+}
+
+# Function to stop the demo
+stop_demo() {
+  # Step 1: Stop Docker Compose services
+  echo "Stopping HR Agent using Docker Compose..."
+  docker compose down -v
+
+  # Step 2: Stop Arch
+  echo "Stopping Arch..."
+  archgw down
+}
+
+# Main script logic
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  # Default action is to bring the demo up
+  start_demo
+fi
--- a/demos/network_agent/arch_config.yaml
+++ b/demos/network_agent/arch_config.yaml
@ -17,36 +17,32 @@ system_prompt: |
  You are a network assistant that helps operators with a better understanding of network traffic flow and perform actions on networking operations. No advice on manufacturers or purchasing decisions.

 prompt_targets:
-    - name: device_summary
-      description: Retrieve network statistics for specific devices within a time range
-      endpoint:
-        name: app_server
-        path: /agent/device_summary
-        http_method: POST
-      parameters:
-        - name: device_ids
-          type: list
-          description: A list of device identifiers (IDs) to retrieve statistics for.
-          required: true  # device_ids are required to get device statistics
-        - name: days
-          type: int
-          description: The number of days for which to gather device statistics.
-          default: "7"
-    - name: reboot_devices
-      description: Reboot a list of devices
-      endpoint:
-        name: app_server
-        path: /agent/device_reboot
-        http_method: POST
-      parameters:
-        - name: device_ids
-          type: list
-          description: A list of device identifiers (IDs).
-          required: true
-        - name: days
-          type: int
-          description: A list of device identifiers (IDs)
-          default: "7"
+  - name: device_summary
+    description: Retrieve network statistics for specific devices within a time range
+    endpoint:
+      name: app_server
+      path: /agent/device_summary
+      http_method: POST
+    parameters:
+      - name: device_ids
+        type: list
+        description: A list of device identifiers (IDs) to retrieve statistics for.
+        required: true # device_ids are required to get device statistics
+      - name: days
+        type: int
+        description: The number of days for which to gather device statistics.
+        default: "7"
+  - name: reboot_devices
+    description: Reboot a list of devices
+    endpoint:
+      name: app_server
+      path: /agent/device_reboot
+      http_method: POST
+    parameters:
+      - name: device_ids
+        type: list
+        description: A list of device identifiers (IDs).
+        required: true

 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
@ -54,6 +50,6 @@ endpoints:
    # value could be ip address or a hostname with port
    # this could also be a list of endpoints for load balancing
    # for example endpoint: [ ip1:port, ip2:port ]
-    endpoint: host.docker.internal:18080
+    endpoint: host.docker.internal:18083
    # max time to wait for a connection to be established
    connect_timeout: 0.005s
--- a/demos/network_agent/docker-compose.yaml
+++ b/demos/network_agent/docker-compose.yaml
@ -2,14 +2,19 @@ services:
  api_server:
    build:
      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "18083:80"
+
+  chatbot_ui:
+    build:
+      context: ../shared/chatbot_ui
+      dockerfile: Dockerfile
+    ports:
+      - "18080:8080"
    environment:
      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml
-      - ../shared/chatbot_ui/common.py:/app/common.py
-    ports:
-      - "18080:80"
-    healthcheck:
-        test: ["CMD", "curl" ,"http://localhost:80/healthz"]
-        interval: 5s
-        retries: 20
--- a/demos/network_agent/main.py
+++ b/demos/network_agent/main.py
@ -1,15 +1,14 @@
-from openai import OpenAI
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, Field
-from typing import List, Optional
-from common import create_gradio_app
-import gradio as gr
 import os
+from typing import List, Optional

+from fastapi import FastAPI, HTTPException
+from openai import OpenAI
+from pydantic import BaseModel, Field

 app = FastAPI()
-demo_description = """This demo illustrates how **Arch** can be used to perform function calling with network-related tasks.
-In this demo, you act as a **network assistant** that provides factual information, without offering advice on manufacturers or purchasing decisions."""
+DEMO_DESCRIPTION = """This demo illustrates how **Arch** can be used to perform function calling
+ with network-related tasks. In this demo, you act as a **network assistant** that provides factual
+ information, without offering advice on manufacturers or purchasing decisions."""


 # Define the request model
@ -52,7 +51,8 @@ def reboot_network_device(request_data: DeviceRebootRequest):
    # Access data from the Pydantic model
    device_ids = request_data.device_ids

-    # Validate 'device_ids' (This is already validated by Pydantic, but additional logic can be added if needed)
+    # Validate 'device_ids'
+    # (This is already validated by Pydantic, but additional logic can be added if needed)
    if not device_ids:
        raise HTTPException(
            status_code=400, detail="'device_ids' parameter is required"
@ -87,7 +87,8 @@ def get_device_summary(request: DeviceSummaryRequest):
        stats = {
            "device_id": device_id,
            "time_range": f"Last {time_range} days",
-            "data": f"Device {device_id} over the last {time_range} days experienced {minutes} minutes of downtime.",
+            "data": f"""Device {device_id} over the last {time_range} days experienced {minutes}
+             minutes of downtime.""",
        }
        minutes += 1
        statistics.append(DeviceStatistics(**stats))
@ -100,10 +101,3 @@ client = OpenAI(
    api_key="--",
    base_url=CHAT_COMPLETION_ENDPOINT,
 )
-
-gr.mount_gradio_app(
-    app, create_gradio_app(demo_description, client), path="/agent/chat"
-)
-
-if __name__ == "__main__":
-    app.run(debug=True)
--- a/demos/network_agent/utils.py
+++ b/demos/network_agent/utils.py
@ -1,10 +1,11 @@
-import pandas as pd
-import random
-from datetime import datetime, timedelta, timezone
-import re
 import logging
-from dateparser import parse
+import random
+import re
 import sqlite3
+from datetime import datetime, timedelta, timezone
+
+import pandas as pd
+from dateparser import parse

 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@ -12,7 +13,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def load_sql():
+def loadsql():
    # Example Usage
    conn = sqlite3.connect(":memory:")

@ -72,7 +73,10 @@ def random_mac():

 # Function to generate random IP addresses
 def random_ip():
-    return f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}"
+    return f"""{random.randint(1, 255)}
+    .{random.randint(1, 255)}
+    .{random.randint(1, 255)}
+    .{random.randint(1, 255)}"""


 # Generate synthetic data for the device table
@ -88,7 +92,8 @@ def generate_device_data(
        "layer": ["L2" if i % 2 == 0 else "L3" for i in range(n)],
        "region": [random.choice(["US", "EU", "ASIA"]) for _ in range(n)],
        "uptime": [
-            f"{random.randint(0, 10)} days {random.randint(0, 23)}:{random.randint(0, 59)}:{random.randint(0, 59)}"
+            f"""{random.randint(0, 10)} days {random.randint(0, 23)}
+            :{random.randint(0, 59)}:{random.randint(0, 59)}"""
            for _ in range(n)
        ],
        "device_mac_address": [random_mac() for _ in range(n)],
@ -129,7 +134,6 @@ def generate_interface_stats_data(conn, device_df, n=1000):
        )
    df = pd.DataFrame(interface_stats_data)
    df.to_sql("interfacestats", conn, index=False)
-    return


 # Generate synthetic data for the ts_flow table
@ -175,14 +179,13 @@ def generate_flow_data(conn, device_df, n=1000):
        )
    df = pd.DataFrame(flow_data)
    df.to_sql("ts_flow", conn, index=False)
-    return


 def load_params(req):
    # Step 1: Convert the from_time natural language string to a timestamp if provided
    if req.from_time:
        # Use `dateparser` to parse natural language timeframes
-        logger.info(f"{'* ' * 50}\n\nCaptured from time: {req.from_time}\n\n")
+        logger.info("%s\n\nCaptured from time: %s\n\n", "* " * 50, req.from_time)
        parsed_time = parse(req.from_time, settings={"RELATIVE_BASE": datetime.now()})
        if not parsed_time:
            conv_time = convert_to_ago_format(req.from_time)
@ -192,15 +195,16 @@ def load_params(req):
                )
            else:
                return {
-                    "error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."
+                    "error": """Invalid from_time format. Please provide a valid time description
+                    such as 'past 7 days' or 'since last month'."""
                }
-        logger.info(f"\n\nConverted from time: {parsed_time}\n\n{'* ' * 50}\n\n")
+        logger.info("\n\nConverted from time: %s\n\n%s\n\n", parsed_time, "* " * 50)
        from_time = parsed_time
-        logger.info(f"Using parsed from_time: {from_time}")
+        logger.info("Using parsed from_time: %f", from_time)
    else:
        # If no from_time is provided, use a default value (e.g., the past 7 days)
        from_time = datetime.now() - timedelta(days=7)
-        logger.info(f"Using default from_time: {from_time}")
+        logger.info("Using default from_time: %f", from_time)

    # Step 2: Build the dynamic SQL query based on the optional filters
    filters = []
--- a/demos/test_runner/common.py
+++ b/demos/test_runner/common.py
@ -1,25 +1,8 @@
 import json
-import os


-PROMPT_GATEWAY_ENDPOINT = os.getenv(
-    "PROMPT_GATEWAY_ENDPOINT", "http://localhost:10000/v1/chat/completions"
-)
-LLM_GATEWAY_ENDPOINT = os.getenv(
-    "LLM_GATEWAY_ENDPOINT", "http://localhost:12000/v1/chat/completions"
-)
 ARCH_STATE_HEADER = "x-arch-state"

-PREFILL_LIST = [
-    "May",
-    "Could",
-    "Sure",
-    "Definitely",
-    "Certainly",
-    "Of course",
-    "Can",
-]
-

 def get_data_chunks(stream, n=1):
    chunks = []
--- a/demos/test_runner/poetry.lock
+++ b/demos/test_runner/poetry.lock
@ -0,0 +1,484 @@
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+
+[[package]]
+name = "certifi"
+version = "2024.12.14"
+description = "Python package for providing Mozilla's CA Bundle."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"},
+    {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"},
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.4.0"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"},
+    {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"},
+    {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"},
+    {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"},
+    {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"},
+    {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"},
+    {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"},
+    {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"},
+    {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"},
+    {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"},
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+
+[[package]]
+name = "coverage"
+version = "7.6.9"
+description = "Code coverage measurement for Python"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "coverage-7.6.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:85d9636f72e8991a1706b2b55b06c27545448baf9f6dbf51c4004609aacd7dcb"},
+    {file = "coverage-7.6.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:608a7fd78c67bee8936378299a6cb9f5149bb80238c7a566fc3e6717a4e68710"},
+    {file = "coverage-7.6.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96d636c77af18b5cb664ddf12dab9b15a0cfe9c0bde715da38698c8cea748bfa"},
+    {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75cded8a3cff93da9edc31446872d2997e327921d8eed86641efafd350e1df1"},
+    {file = "coverage-7.6.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7b15f589593110ae767ce997775d645b47e5cbbf54fd322f8ebea6277466cec"},
+    {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:44349150f6811b44b25574839b39ae35291f6496eb795b7366fef3bd3cf112d3"},
+    {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d891c136b5b310d0e702e186d70cd16d1119ea8927347045124cb286b29297e5"},
+    {file = "coverage-7.6.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:db1dab894cc139f67822a92910466531de5ea6034ddfd2b11c0d4c6257168073"},
+    {file = "coverage-7.6.9-cp310-cp310-win32.whl", hash = "sha256:41ff7b0da5af71a51b53f501a3bac65fb0ec311ebed1632e58fc6107f03b9198"},
+    {file = "coverage-7.6.9-cp310-cp310-win_amd64.whl", hash = "sha256:35371f8438028fdccfaf3570b31d98e8d9eda8bb1d6ab9473f5a390969e98717"},
+    {file = "coverage-7.6.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:932fc826442132dde42ee52cf66d941f581c685a6313feebed358411238f60f9"},
+    {file = "coverage-7.6.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:085161be5f3b30fd9b3e7b9a8c301f935c8313dcf928a07b116324abea2c1c2c"},
+    {file = "coverage-7.6.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccc660a77e1c2bf24ddbce969af9447a9474790160cfb23de6be4fa88e3951c7"},
+    {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c69e42c892c018cd3c8d90da61d845f50a8243062b19d228189b0224150018a9"},
+    {file = "coverage-7.6.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0824a28ec542a0be22f60c6ac36d679e0e262e5353203bea81d44ee81fe9c6d4"},
+    {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4401ae5fc52ad8d26d2a5d8a7428b0f0c72431683f8e63e42e70606374c311a1"},
+    {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98caba4476a6c8d59ec1eb00c7dd862ba9beca34085642d46ed503cc2d440d4b"},
+    {file = "coverage-7.6.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ee5defd1733fd6ec08b168bd4f5387d5b322f45ca9e0e6c817ea6c4cd36313e3"},
+    {file = "coverage-7.6.9-cp311-cp311-win32.whl", hash = "sha256:f2d1ec60d6d256bdf298cb86b78dd715980828f50c46701abc3b0a2b3f8a0dc0"},
+    {file = "coverage-7.6.9-cp311-cp311-win_amd64.whl", hash = "sha256:0d59fd927b1f04de57a2ba0137166d31c1a6dd9e764ad4af552912d70428c92b"},
+    {file = "coverage-7.6.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:99e266ae0b5d15f1ca8d278a668df6f51cc4b854513daab5cae695ed7b721cf8"},
+    {file = "coverage-7.6.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9901d36492009a0a9b94b20e52ebfc8453bf49bb2b27bca2c9706f8b4f5a554a"},
+    {file = "coverage-7.6.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abd3e72dd5b97e3af4246cdada7738ef0e608168de952b837b8dd7e90341f015"},
+    {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff74026a461eb0660366fb01c650c1d00f833a086b336bdad7ab00cc952072b3"},
+    {file = "coverage-7.6.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65dad5a248823a4996724a88eb51d4b31587aa7aa428562dbe459c684e5787ae"},
+    {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:22be16571504c9ccea919fcedb459d5ab20d41172056206eb2994e2ff06118a4"},
+    {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f957943bc718b87144ecaee70762bc2bc3f1a7a53c7b861103546d3a403f0a6"},
+    {file = "coverage-7.6.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ae1387db4aecb1f485fb70a6c0148c6cdaebb6038f1d40089b1fc84a5db556f"},
+    {file = "coverage-7.6.9-cp312-cp312-win32.whl", hash = "sha256:1a330812d9cc7ac2182586f6d41b4d0fadf9be9049f350e0efb275c8ee8eb692"},
+    {file = "coverage-7.6.9-cp312-cp312-win_amd64.whl", hash = "sha256:b12c6b18269ca471eedd41c1b6a1065b2f7827508edb9a7ed5555e9a56dcfc97"},
+    {file = "coverage-7.6.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:899b8cd4781c400454f2f64f7776a5d87bbd7b3e7f7bda0cb18f857bb1334664"},
+    {file = "coverage-7.6.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:61f70dc68bd36810972e55bbbe83674ea073dd1dcc121040a08cdf3416c5349c"},
+    {file = "coverage-7.6.9-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a289d23d4c46f1a82d5db4abeb40b9b5be91731ee19a379d15790e53031c014"},
+    {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e216d8044a356fc0337c7a2a0536d6de07888d7bcda76febcb8adc50bdbbd00"},
+    {file = "coverage-7.6.9-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c026eb44f744acaa2bda7493dad903aa5bf5fc4f2554293a798d5606710055d"},
+    {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e77363e8425325384f9d49272c54045bbed2f478e9dd698dbc65dbc37860eb0a"},
+    {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:777abfab476cf83b5177b84d7486497e034eb9eaea0d746ce0c1268c71652077"},
+    {file = "coverage-7.6.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:447af20e25fdbe16f26e84eb714ba21d98868705cb138252d28bc400381f6ffb"},
+    {file = "coverage-7.6.9-cp313-cp313-win32.whl", hash = "sha256:d872ec5aeb086cbea771c573600d47944eea2dcba8be5f3ee649bfe3cb8dc9ba"},
+    {file = "coverage-7.6.9-cp313-cp313-win_amd64.whl", hash = "sha256:fd1213c86e48dfdc5a0cc676551db467495a95a662d2396ecd58e719191446e1"},
+    {file = "coverage-7.6.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ba9e7484d286cd5a43744e5f47b0b3fb457865baf07bafc6bee91896364e1419"},
+    {file = "coverage-7.6.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e5ea1cf0872ee455c03e5674b5bca5e3e68e159379c1af0903e89f5eba9ccc3a"},
+    {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d10e07aa2b91835d6abec555ec8b2733347956991901eea6ffac295f83a30e4"},
+    {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:13a9e2d3ee855db3dd6ea1ba5203316a1b1fd8eaeffc37c5b54987e61e4194ae"},
+    {file = "coverage-7.6.9-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c38bf15a40ccf5619fa2fe8f26106c7e8e080d7760aeccb3722664c8656b030"},
+    {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d5275455b3e4627c8e7154feaf7ee0743c2e7af82f6e3b561967b1cca755a0be"},
+    {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8f8770dfc6e2c6a2d4569f411015c8d751c980d17a14b0530da2d7f27ffdd88e"},
+    {file = "coverage-7.6.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8d2dfa71665a29b153a9681edb1c8d9c1ea50dfc2375fb4dac99ea7e21a0bcd9"},
+    {file = "coverage-7.6.9-cp313-cp313t-win32.whl", hash = "sha256:5e6b86b5847a016d0fbd31ffe1001b63355ed309651851295315031ea7eb5a9b"},
+    {file = "coverage-7.6.9-cp313-cp313t-win_amd64.whl", hash = "sha256:97ddc94d46088304772d21b060041c97fc16bdda13c6c7f9d8fcd8d5ae0d8611"},
+    {file = "coverage-7.6.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:adb697c0bd35100dc690de83154627fbab1f4f3c0386df266dded865fc50a902"},
+    {file = "coverage-7.6.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:be57b6d56e49c2739cdf776839a92330e933dd5e5d929966fbbd380c77f060be"},
+    {file = "coverage-7.6.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1592791f8204ae9166de22ba7e6705fa4ebd02936c09436a1bb85aabca3e599"},
+    {file = "coverage-7.6.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e12ae8cc979cf83d258acb5e1f1cf2f3f83524d1564a49d20b8bec14b637f08"},
+    {file = "coverage-7.6.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb5555cff66c4d3d6213a296b360f9e1a8e323e74e0426b6c10ed7f4d021e464"},
+    {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b9389a429e0e5142e69d5bf4a435dd688c14478a19bb901735cdf75e57b13845"},
+    {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:592ac539812e9b46046620341498caf09ca21023c41c893e1eb9dbda00a70cbf"},
+    {file = "coverage-7.6.9-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a27801adef24cc30871da98a105f77995e13a25a505a0161911f6aafbd66e678"},
+    {file = "coverage-7.6.9-cp39-cp39-win32.whl", hash = "sha256:8e3c3e38930cfb729cb8137d7f055e5a473ddaf1217966aa6238c88bd9fd50e6"},
+    {file = "coverage-7.6.9-cp39-cp39-win_amd64.whl", hash = "sha256:e28bf44afa2b187cc9f41749138a64435bf340adfcacb5b2290c070ce99839d4"},
+    {file = "coverage-7.6.9-pp39.pp310-none-any.whl", hash = "sha256:f3ca78518bc6bc92828cd11867b121891d75cae4ea9e908d72030609b996db1b"},
+    {file = "coverage-7.6.9.tar.gz", hash = "sha256:4a8d8977b0c6ef5aeadcb644da9e69ae0dcfe66ec7f368c89c72e058bd71164d"},
+]
+
+[package.extras]
+toml = ["tomli"]
+
+[[package]]
+name = "deepdiff"
+version = "8.1.1"
+description = "Deep Difference and Search of any Python object/data. Recreate objects by adding adding deltas to each other."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "deepdiff-8.1.1-py3-none-any.whl", hash = "sha256:b0231fa3afb0f7184e82535f2b4a36636442ed21e94a0cf3aaa7982157e7ebca"},
+    {file = "deepdiff-8.1.1.tar.gz", hash = "sha256:dd7bc7d5c8b51b5b90f01b0e2fe23c801fd8b4c6a7ee7e31c5a3c3663fcc7ceb"},
+]
+
+[package.dependencies]
+orderly-set = ">=5.2.3,<6"
+
+[package.extras]
+cli = ["click (==8.1.7)", "pyyaml (==6.0.2)"]
+optimize = ["orjson"]
+
+[[package]]
+name = "idna"
+version = "3.10"
+description = "Internationalized Domain Names in Applications (IDNA)"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"},
+    {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
+]
+
+[package.extras]
+all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
+
+[[package]]
+name = "iniconfig"
+version = "2.0.0"
+description = "brain-dead simple config-ini parsing"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
+    {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
+]
+
+[[package]]
+name = "orderly-set"
+version = "5.2.3"
+description = "Orderly set"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "orderly_set-5.2.3-py3-none-any.whl", hash = "sha256:d357cedcf67f4ebff0d4cbd5b0997e98eeb65dd24fdf5c990a501ae9e82c7d34"},
+    {file = "orderly_set-5.2.3.tar.gz", hash = "sha256:571ed97c5a5fca7ddeb6b2d26c19aca896b0ed91f334d9c109edd2f265fb3017"},
+]
+
+[[package]]
+name = "packaging"
+version = "24.2"
+description = "Core utilities for Python packages"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
+    {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
+]
+
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+description = "plugin and hook calling mechanisms for python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
+]
+
+[package.extras]
+dev = ["pre-commit", "tox"]
+testing = ["pytest", "pytest-benchmark"]
+
+[[package]]
+name = "pytest"
+version = "8.3.4"
+description = "pytest: simple powerful testing with Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
+    {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "sys_platform == \"win32\""}
+iniconfig = "*"
+packaging = "*"
+pluggy = ">=1.5,<2"
+
+[package.extras]
+dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+
+[[package]]
+name = "pytest-cov"
+version = "4.1.0"
+description = "Pytest plugin for measuring coverage."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"},
+    {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"},
+]
+
+[package.dependencies]
+coverage = {version = ">=5.2.1", extras = ["toml"]}
+pytest = ">=4.6"
+
+[package.extras]
+testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"]
+
+[[package]]
+name = "pytest-retry"
+version = "1.6.3"
+description = "Adds the ability to retry flaky tests in CI environments"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pytest_retry-1.6.3-py3-none-any.whl", hash = "sha256:e96f7df77ee70b0838d1085f9c3b8b5b7d74bf8947a0baf32e2b8c71b27683c8"},
+    {file = "pytest_retry-1.6.3.tar.gz", hash = "sha256:36ccfa11c8c8f9ddad5e20375182146d040c20c4a791745139c5a99ddf1b557d"},
+]
+
+[package.dependencies]
+pytest = ">=7.0.0"
+
+[package.extras]
+dev = ["black", "flake8", "isort", "mypy"]
+
+[[package]]
+name = "pytest-sugar"
+version = "1.0.0"
+description = "pytest-sugar is a plugin for pytest that changes the default look and feel of pytest (e.g. progressbar, show tests that fail instantly)."
+optional = false
+python-versions = "*"
+files = [
+    {file = "pytest-sugar-1.0.0.tar.gz", hash = "sha256:6422e83258f5b0c04ce7c632176c7732cab5fdb909cb39cca5c9139f81276c0a"},
+    {file = "pytest_sugar-1.0.0-py3-none-any.whl", hash = "sha256:70ebcd8fc5795dc457ff8b69d266a4e2e8a74ae0c3edc749381c64b5246c8dfd"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+pytest = ">=6.2.0"
+termcolor = ">=2.1.0"
+
+[package.extras]
+dev = ["black", "flake8", "pre-commit"]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.2"
+description = "YAML parser and emitter for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
+    {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
+    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
+    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
+    {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
+    {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
+    {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
+    {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
+    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
+    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
+    {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
+    {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
+    {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
+    {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
+    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
+    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
+    {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
+    {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
+    {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
+    {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
+    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
+    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
+    {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
+    {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
+    {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
+    {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
+    {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
+    {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
+    {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
+    {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
+    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
+    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
+    {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
+    {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
+    {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
+]
+
+[[package]]
+name = "requests"
+version = "2.32.3"
+description = "Python HTTP for Humans."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
+    {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
+]
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+charset-normalizer = ">=2,<4"
+idna = ">=2.5,<4"
+urllib3 = ">=1.21.1,<3"
+
+[package.extras]
+socks = ["PySocks (>=1.5.6,!=1.5.7)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
+
+[[package]]
+name = "termcolor"
+version = "2.5.0"
+description = "ANSI color formatting for output in terminal"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8"},
+    {file = "termcolor-2.5.0.tar.gz", hash = "sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f"},
+]
+
+[package.extras]
+tests = ["pytest", "pytest-cov"]
+
+[[package]]
+name = "urllib3"
+version = "2.2.3"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
+    {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
+]
+
+[package.extras]
+brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
+h2 = ["h2 (>=4,<5)"]
+socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
+zstd = ["zstandard (>=0.18.0)"]
+
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.12"
+content-hash = "be71fbf338d83dd6170e7d9df2df8d1035a07397fe5ffc522c96a92cfe3318bd"
--- a/demos/test_runner/pyproject.toml
+++ b/demos/test_runner/pyproject.toml
@ -0,0 +1,27 @@
+[tool.poetry]
+name = "demo tests"
+version = "0.0.1"
+description = "demo tests runner"
+authors = ["Katanemo Labs, Inc <info@katanemo.com>"]
+license = "Apache 2.0"
+readme = "README.md"
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.12"
+pytest = "^8.3.3"
+requests = "^2.29.0"
+pytest-sugar = "^1.0.0"
+deepdiff = "^8.0.1"
+pytest-retry = "^1.6.3"
+pyyaml = "*"
+
+[tool.poetry.dev-dependencies]
+pytest-cov = "^4.1.0"
+
+[tool.pytest.ini_options]
+python_files = ["test*.py"]
+addopts = ["-v", "-s"]
+retries = 2
+retry_delay = 0.5
+cumulative_timing = false
--- a/demos/test_runner/run_demo_tests.sh
+++ b/demos/test_runner/run_demo_tests.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+set -eu
+
+# for demo in currency_exchange hr_agent
+for demo in currency_exchange
+do
+  echo "******************************************"
+  echo "Running tests for $demo ..."
+  echo "****************************************"
+  cd ../$demo
+  archgw up arch_config.yaml
+  docker compose up -d
+  cd ../test_runner
+  TEST_DATA=../$demo/test_data.yaml poetry run pytest
+  cd ../$demo
+  archgw down
+  docker compose down -v
+  cd ../test_runner
+done
--- a/demos/test_runner/test_demos.py
+++ b/demos/test_runner/test_demos.py
@ -0,0 +1,60 @@
+import json
+import os
+from common import get_arch_messages
+import pytest
+import requests
+from deepdiff import DeepDiff
+import logging
+import yaml
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+ARCHGW_ENDPOINT = os.getenv(
+    "ARCHGW_ENDPOINT", "http://localhost:10000/v1/chat/completions"
+)
+
+# Load test data from YAML file
+with open(os.getenv("TEST_DATA", "test_data.yaml"), "r") as file:
+    test_data_yaml = yaml.safe_load(file)
+
+
+@pytest.mark.parametrize(
+    "test_data",
+    [
+        pytest.param(test_case, id=test_case["id"])
+        for test_case in test_data_yaml["test_cases"]
+    ],
+)
+def test_demos(test_data):
+    input = test_data["input"]
+    expected_tools = test_data["expected_tools"]
+    expected_output_contains = test_data["expected_output_contains"]
+
+    response = requests.post(ARCHGW_ENDPOINT, json=input)
+    assert response.status_code == 200
+    # ensure that response is json
+    assert response.headers["content-type"] == "application/json"
+
+    response_json = response.json()
+    assert response_json.get("model").startswith("gpt-4o")
+    choices = response_json.get("choices", [])
+    assert len(choices) > 0
+
+    # ensure that model responded according to the expectation
+    assert "role" in choices[0]["message"]
+    assert choices[0]["message"]["role"] == "assistant"
+    assert expected_output_contains.lower() in choices[0]["message"]["content"].lower()
+
+    # now verify arch_messages (tool call and api response) that are sent as response metadata
+    arch_messages = get_arch_messages(response_json)
+    assert len(arch_messages) == 2
+    tool_calls_message = arch_messages[0]
+    tool_calls = tool_calls_message.get("tool_calls", [])
+    assert len(tool_calls) > 0
+
+    # remove dynamic id from tool_calls
+    for tool_call in tool_calls:
+        tool_call.pop("id", None)
+    diff = DeepDiff(expected_tools, tool_calls, ignore_string_case=True)
+    assert not diff
--- a/demos/weather_forecast/arch_config.yaml
+++ b/demos/weather_forecast/arch_config.yaml
@ -42,21 +42,18 @@ prompt_guards:
        message: Looks like you're curious about my abilities, but I can only provide assistance for weather forecasting.

 prompt_targets:
-  - name: weather_forecast
-    description: Check weather information for a given city.
+  - name: get_current_weather
+    description: Get current weather at a location.
    parameters:
-      - name: city
-        description: the name of the city
+      - name: location
+        description: The location to get the weather for
        required: true
-        type: str
+        type: string
+        format: City, State
      - name: days
-        description: the number of days
-        type: int
+        description: the number of days for the request
        required: true
-      - name: units
-        description: the temperature unit, e.g., Celsius and Fahrenheit
-        type: str
-        default: Fahrenheit
+        type: int
    endpoint:
      name: weather_forecast_service
      path: /weather
--- a/demos/weather_forecast/docker-compose-logfire.yaml
+++ b/demos/weather_forecast/docker-compose-logfire.yaml
@ -1,7 +1,7 @@
 services:
  weather_forecast_service:
    build:
-      context: ./weather_forecast_service
+      context: ./
    environment:
      - OLTP_HOST=http://otel-collector:4317
    extra_hosts:
--- a/demos/weather_forecast/docker-compose-signoz.yaml
+++ b/demos/weather_forecast/docker-compose-signoz.yaml
@ -4,7 +4,7 @@ include:
 services:
  weather_forecast_service:
    build:
-      context: ./weather_forecast_service
+      context: .
    environment:
      - OLTP_HOST=http://otel-collector:4317
    extra_hosts:
--- a/demos/weather_forecast/main.py
+++ b/demos/weather_forecast/main.py
@ -42,7 +42,7 @@ async def healthz():


 class WeatherRequest(BaseModel):
-    city: str
+    location: str
    days: int = 7
    units: str = "Farenheit"

@ -50,7 +50,7 @@ class WeatherRequest(BaseModel):
@app.post("/weather")
 async def weather(req: WeatherRequest, res: Response):
    weather_forecast = {
-        "city": req.city,
+        "location": req.location,
        "temperature": [],
        "units": req.units,
    }
--- a/demos/weather_forecast/pyproject.toml
+++ b/demos/weather_forecast/pyproject.toml
@ -2,7 +2,7 @@
 name = "api-server"
 version = "0.1.0"
 description = ""
-authors = ["Adil Hafeez <adil@katanemo.com>"]
+authors = ["Adil Hafeez <info@katanemo.com>"]
 readme = "README.md"

 [tool.poetry.dependencies]
--- a/demos/weather_forecast/run_demo.sh
+++ b/demos/weather_forecast/run_demo.sh
@ -56,7 +56,7 @@ start_demo() {

  load_env

-  if [ "$1" == "logfire" ] && [ -z "$LOGFIRE_API_KEY"]; then
+  if [ "$1" == "logfire" ] && [ -z "$LOGFIRE_API_KEY" ]; then
    echo "Error: LOGFIRE_API_KEY environment variable is required for Logfire."
    exit 1
  fi
--- a/docs/source/_static/img/input-token-metrics.png
+++ b/docs/source/_static/img/input-token-metrics.png
--- a/docs/source/_static/img/llm-request-metrics.png
+++ b/docs/source/_static/img/llm-request-metrics.png
--- a/docs/source/_static/img/output-token-metrics.png
+++ b/docs/source/_static/img/output-token-metrics.png
--- a/docs/source/build_with_arch/agent.rst
+++ b/docs/source/build_with_arch/agent.rst
@ -1,18 +1,18 @@
 .. _arch_agent_guide:

-Agentic Workflow
-==============================
+Agentic Apps
+=============

-Arch helps you easily personalize your applications by calling application-specific (API) functions
-via user prompts. This involves any predefined functions or APIs you want to expose to users to perform tasks,
-gather information, or manipulate data. This capability is generally referred to as :ref:`function calling <function_calling>`, where
-you have the flexibility to support “agentic” apps tailored to specific use cases - from updating insurance
-claims to creating ad campaigns - via prompts.
+Arch helps you build personalized agentic applications by calling application-specific (API) functions via user prompts.
+This involves any predefined functions or APIs you want to expose to users to perform tasks, gather information,
+or manipulate data. This capability is generally referred to as :ref:`function calling <function_calling>`, where
+you can support “agentic” apps tailored to specific use cases - from updating insurance claims to creating ad campaigns - via prompts.

-Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with
-the user to gather any missing parameters and makes API calls so that you can focus on writing business logic.
-Arch does this via its purpose-built `Arch-Function <https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68>`_ - the fastest (200ms p90 - 10x faser than GPT-4o)
-and cheapest (100x than GPT-4o) function calling LLM that matches performance with frontier models.
+Arch analyzes prompts, extracts critical information from prompts, engages in lightweight conversation with the user to
+gather any missing parameters and makes API calls so that you can focus on writing business logic. Arch does this via its
+purpose-built `Arch-Function <https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68>`_ -
+the fastest (200ms p50 - 12x faser than GPT-4o) and cheapest (44x than GPT-4o) function calling LLM that matches or outperforms
+frontier LLMs.

 .. image:: includes/agent/function-calling-flow.jpg
   :width: 100%
--- a/docs/source/build_with_arch/includes/multi_turn/multi_turn_rag.py
+++ b/docs/source/build_with_arch/includes/multi_turn/multi_turn_rag.py
@ -0,0 +1,39 @@
+import os
+import gradio as gr
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional
+from openai import OpenAI
+from common import create_gradio_app
+
+app = FastAPI()
+
+
+# Define the request model
+class EnergySourceRequest(BaseModel):
+    energy_source: str
+    consideration: Optional[str] = None
+
+
+class EnergySourceResponse(BaseModel):
+    energy_source: str
+    consideration: Optional[str] = None
+
+
+# Post method for device summary
+@app.post("/agent/energy_source_info")
+def get_workforce(request: EnergySourceRequest):
+    """
+    Endpoint to get details about energy source
+    """
+    considertion = "You don't have any specific consideration. Feel free to talk in a more open ended fashion"
+
+    if request.consideration is not None:
+        considertion = f"Add specific focus on the following consideration when you summarize the content for the energy source: {request.consideration}"
+
+    response = {
+        "energy_source": request.energy_source,
+        "consideration": considertion,
+    }
+    return response
--- a/docs/source/build_with_arch/includes/multi_turn/mutli-turn-example.png
+++ b/docs/source/build_with_arch/includes/multi_turn/mutli-turn-example.png
--- a/docs/source/build_with_arch/includes/multi_turn/prompt_targets_multi_turn.yaml
+++ b/docs/source/build_with_arch/includes/multi_turn/prompt_targets_multi_turn.yaml
@ -0,0 +1,35 @@
+version: v0.1
+listener:
+  address: 127.0.0.1
+  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
+  message_format: huggingface
+
+# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
+llm_providers:
+  - name: OpenAI
+    provider: openai
+    access_key: $OPENAI_API_KEY
+    model: gpt-3.5-turbo
+    default: true
+
+# default system prompt used by all prompt targets
+system_prompt: |
+   You are a helpful assistant and can offer information about energy sources. You will get a JSON object with energy_source and consideration fields. Focus on answering using those fields
+
+prompt_targets:
+  - name: get_info_for_energy_source
+    description: get information about an energy source
+    parameters:
+      - name: energy_source
+        type: str
+        description: a source of energy
+        required: true
+        enum: [renewable, fossil]
+      - name: consideration
+        type: str
+        description: a specific type of consideration for an energy source
+        enum: [cost, economic, technology]
+    endpoint:
+      name: rag_energy_source_agent
+      path: /agent/energy_source_info
+      http_method: POST
--- a/docs/source/build_with_arch/includes/rag/intent_detection.py
+++ b/docs/source/build_with_arch/includes/rag/intent_detection.py
@ -1,162 +0,0 @@
-from flask import Flask, request, jsonify
-from datetime import datetime
-import uuid
-from langchain.memory import ConversationBufferMemory
-from langchain.schema import AIMessage, HumanMessage
-from langchain import OpenAI
-
-app = Flask(__name__)
-
-# Global dictionary to keep track of user memories
-user_memories = {}
-
-
-def get_user_conversation(user_id):
-    """
-    Retrieve the user's conversation memory using LangChain.
-    If the user does not exist, initialize their conversation memory.
-    """
-    if user_id not in user_memories:
-        user_memories[user_id] = ConversationBufferMemory(return_messages=True)
-    return user_memories[user_id]
-
-
-def update_user_conversation(user_id, client_messages, intent_changed):
-    """
-    Update the user's conversation memory with new messages using LangChain.
-    Each message is augmented with a UUID, timestamp, and intent change marker.
-    Only new messages are added to avoid duplication.
-    """
-    memory = get_user_conversation(user_id)
-    stored_messages = memory.chat_memory.messages
-
-    # Determine the number of stored messages
-    num_stored_messages = len(stored_messages)
-    new_messages = client_messages[num_stored_messages:]
-
-    # Process each new message
-    for index, message in enumerate(new_messages):
-        role = message.get("role")
-        content = message.get("content")
-        metadata = {
-            "uuid": str(uuid.uuid4()),
-            "timestamp": datetime.utcnow().isoformat(),
-            "intent_changed": False,  # Default value
-        }
-
-        # Mark the intent change on the last message if detected
-        if intent_changed and index == len(new_messages) - 1:
-            metadata["intent_changed"] = True
-
-        # Create a new message with metadata
-        if role == "user":
-            memory.chat_memory.add_message(
-                HumanMessage(content=content, additional_kwargs={"metadata": metadata})
-            )
-        elif role == "assistant":
-            memory.chat_memory.add_message(
-                AIMessage(content=content, additional_kwargs={"metadata": metadata})
-            )
-        else:
-            # Handle other roles if necessary
-            pass
-
-    return memory
-
-
-def get_messages_since_last_intent(messages):
-    """
-    Retrieve messages from the last intent change onwards using LangChain.
-    """
-    messages_since_intent = []
-    for message in reversed(messages):
-        # Insert message at the beginning to maintain correct order
-        messages_since_intent.insert(0, message)
-        metadata = message.additional_kwargs.get("metadata", {})
-        # Break if intent_changed is True
-        if metadata.get("intent_changed", False) == True:
-            break
-
-    return messages_since_intent
-
-
-def forward_to_llm(messages):
-    """
-    Forward messages to an upstream LLM using LangChain.
-    """
-    # Convert messages to a conversation string
-    conversation = ""
-    for message in messages:
-        role = "User" if isinstance(message, HumanMessage) else "Assistant"
-        content = message.content
-        conversation += f"{role}: {content}\n"
-    # Use LangChain's LLM to get a response. This call is proxied through Arch for end-to-end observability and traffic management
-    llm = OpenAI()
-    # Create a prompt that includes the conversation
-    prompt = f"{conversation}Assistant:"
-    response = llm(prompt)
-    return response
-
-
-@app.route("/process_rag", methods=["POST"])
-def process_rag():
-    # Extract JSON data from the request
-    data = request.get_json()
-
-    user_id = data.get("user_id")
-    if not user_id:
-        return jsonify({"error": "User ID is required"}), 400
-
-    client_messages = data.get("messages")
-    if not client_messages or not isinstance(client_messages, list):
-        return jsonify({"error": "Messages array is required"}), 400
-
-    # Extract the intent change marker from Arch's headers if present for the current prompt
-    intent_changed_header = request.headers.get("x-arch-intent-marker", "").lower()
-    if intent_changed_header in ["", "false"]:
-        intent_changed = False
-    elif intent_changed_header == "true":
-        intent_changed = True
-    else:
-        # Invalid value provided
-        return (
-            jsonify({"error": "Invalid value for x-arch-prompt-intent-change header"}),
-            400,
-        )
-
-    # Update user conversation based on intent change
-    memory = update_user_conversation(user_id, client_messages, intent_changed)
-
-    # Retrieve messages since last intent change for LLM
-    messages_for_llm = get_messages_since_last_intent(memory.chat_memory.messages)
-
-    # Forward messages to upstream LLM
-    llm_response = forward_to_llm(messages_for_llm)
-
-    # Prepare the messages to return
-    messages_to_return = []
-    for message in memory.chat_memory.messages:
-        role = "user" if isinstance(message, HumanMessage) else "assistant"
-        content = message.content
-        metadata = message.additional_kwargs.get("metadata", {})
-        message_entry = {
-            "uuid": metadata.get("uuid"),
-            "timestamp": metadata.get("timestamp"),
-            "role": role,
-            "content": content,
-            "intent_changed": metadata.get("intent_changed", False),
-        }
-        messages_to_return.append(message_entry)
-
-    # Prepare the response
-    response = {
-        "user_id": user_id,
-        "messages": messages_to_return,
-        "llm_response": llm_response,
-    }
-
-    return jsonify(response), 200
-
-
-if __name__ == "__main__":
-    app.run(debug=True)
--- a/docs/source/build_with_arch/multi_turn.rst
+++ b/docs/source/build_with_arch/multi_turn.rst
@ -0,0 +1,90 @@
+.. _arch_multi_turn_guide:
+
+Multi-Turn
+==========
+Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
+``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
+re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error prone and adds latency and token cost for
+common scenarios that can be managed more efficiently.
+
+Arch is highly capable of accurately detecting and processing prompts in multi-turn scenarios so that you can buil fast and accurate agents in minutes.
+Below are some cnversational examples that you can build via Arch. Each example is enriched with annotations (via ** [Arch] ** ) that illustrates how Arch
+processess conversational messages on your behalf.
+
+.. Note::
+    The following section assumes that you have some knowledge about the core concepts of Arch, such as :ref:`prompt_targets <arch_overview_prompt_handling>`.
+    If you haven't familizaried yourself with Arch's concepts, we recommend you first read the :ref:`tech overview <tech_overview>` section firtst.
+    Additionally, the conversation examples below assume the usage of the following :ref:`arch_config.yaml <multi_turn_subsection_prompt_target>` file.
+
+Example 1: Adjusting Retrieval
+------------------------------
+.. code-block:: text
+
+    User: What are the benefits of renewable energy?
+    **[Arch]**: Check if there is an available <prompt_target> that can handle this user query.
+    **[Arch]**: Found "get_info_for_energy_source" prompt_target in arch_config.yaml. Forward prompt to the endpoint configured in "get_info_for_energy_source"
+    ...
+    Assistant: Renewable energy reduces greenhouse gas emissions, lowers air pollution, and provides sustainable power sources like solar and wind.
+
+    User: Include cost considerations in the response.
+    **[Arch]**: Follow-up detected. Forward prompt history to the "get_info_for_energy_source" prompt_target and post the following parameters consideration="cost"
+    ...
+    Assistant: Renewable energy reduces greenhouse gas emissions, lowers air pollution, and provides sustainable power sources like solar and wind. While the initial setup costs can be high, long-term savings from reduced fuel expenses and government incentives make it cost-effective.
+
+
+Example 2: Switching Intent
+---------------------------
+.. code-block:: text
+
+    User: What are the symptoms of diabetes?
+    **[Arch]**: Check if there is an available <prompt_target> that can handle this user query.
+    **[Arch]**: Found "diseases_symptoms" prompt_target in arch_config.yaml. Forward disease=diabeteres to "diseases_symptoms" prompt target
+    ...
+    Assistant: Common symptoms include frequent urination, excessive thirst, fatigue, and blurry vision.
+
+    User: How is it diagnosed?
+    **[Arch]**: New intent detected.
+    **[Arch]**: Found "disease_diagnoses" prompt_target in arch_config.yaml. Forward disease=diabeteres to "disease_diagnoses" prompt target
+    ...
+    Assistant: Diabetes is diagnosed through blood tests like fasting blood sugar, A1C, or an oral glucose tolerance test.
+
+
+Build Multi-Turn RAG Apps
+--------------------------
+The following section describes how you can easilly add support for multi-turn scenarios via Arch. You process and manage multi-turn prompts
+just like you manage single-turn ones. Arch handles the conpleixity of detecting the correct intent based on the last user prompt and
+the covnersational history, extracts relevant parameters needed by downstream APIs, and dipatches calls to any upstream LLMs to summarize the
+response from your APIs.
+
+
+.. _multi_turn_subsection_prompt_target:
+
+Step 1: Define Arch Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. literalinclude:: includes/multi_turn/prompt_targets_multi_turn.yaml
+    :language: yaml
+    :caption: Arch Config
+    :linenos:
+
+Step 2: Process Request in Flask
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Once the prompt targets are configured as above, handle parameters across multi-turn as if its a single-turn request
+
+.. literalinclude:: includes/multi_turn/multi_turn_rag.py
+    :language: python
+    :caption: Parameter handling with Flask
+    :linenos:
+
+Demo App
+~~~~~~~~
+
+For your convenience, we've built a `demo app <https://github.com/katanemo/archgw/main/demos/multi_turn_rag_agent>`_
+that you can test and modify locally for multi-turn RAG scenarios.
+
+.. figure:: includes/multi_turn/mutli-turn-example.png
+   :width: 100%
+   :align: center
+
+   Example multi-turn user conversation showing adjusting retrieval
--- a/docs/source/build_with_arch/rag.rst
+++ b/docs/source/build_with_arch/rag.rst
@ -1,10 +1,18 @@
 .. _arch_rag_guide:

-RAG Application
-===============
+RAG Apps
+========

 The following section describes how Arch can help you build faster, smarter and more accurate
-Retrieval-Augmented Generation (RAG) applications.
+Retrieval-Augmented Generation (RAG) applications, including fast and accurate RAG in multi-turn
+converational scenarios.
+
+What is Retrieval-Augmented Generation (RAG)?
+---------------------------------------------
+RAG applications combine retrieval-based methods with generative AI models to provide more accurate,
+contextually relevant, and reliable outputs. These applications leverage external data sources to augment
+the capabilities of Large Language Models (LLMs), enabling them to retrieve and integrate specific information
+rather than relying solely on the LLM's internal knowledge.

 Parameter Extraction for RAG
 ----------------------------
@ -33,60 +41,12 @@ Once the prompt targets are configured as above, handling those parameters is
    :caption: Parameter handling with Flask
    :linenos:

-[Coming Soon] `Drift Detection via Arch Intent-Markers <https://github.com/orgs/katanemo/projects/1/views/1?pane=issue&itemId=82697909>`_
-----------------------------------------------------------------------------------------------------------------------------------------
-Developers struggle to efficiently handle ``follow-up`` or ``clarification`` questions. Specifically, when users ask for
-changes or additions to previous responses their AI applications often generate entirely new responses instead of adjusting
-previous ones. Arch offers ``intent tracking`` as a feature so that developers can know when the user has shifted away from a
-previous intent so that they can dramatically improve retrieval accuracy, lower overall token cost and  improve the speed of
-their responses back to users.
+Multi-Turn RAG (Follow-up Questions)
+-------------------------------------
+Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
+``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
+re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error prone and adds signifcant latency to the
+user experience. Arch

-Arch uses its built-in lightweight NLI and embedding models to know if the user has steered away from an active intent.
-Arch's intent-drift detection mechanism is based on its :ref:`prompt target <prompt_target>` primtive. Arch tries to match an incoming
-prompt to one of the prompt_targets configured in the gateway. Once it detects that the user has moved away from an active
-active intent, Arch adds the ``x-arch-intent-marker`` headers to the request before sending it your application servers.
-
-.. literalinclude:: includes/rag/intent_detection.py
-    :language: python
-    :linenos:
-    :lines: 101-157
-    :emphasize-lines: 14-25
-    :caption: Intent Detection Example
-
-
-.. Note::
-
-   Arch is (mostly) stateless so that it can scale in an embarrassingly parrallel fashion. So, while Arch offers
-   intent-drift detetction, you still have to maintain converational state with intent drift as metadata. The
-   following code snippets show how easily you can build and enrich conversational history with Langchain (in Python),
-   so that you can use the most relevant prompts for your retrieval and for prompting upstream LLMs.
-
-
-Step 1: Define ConversationBufferMemory
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. literalinclude:: includes/rag/intent_detection.py
-    :language: python
-    :linenos:
-    :lines: 1-21
-
-Step 2: Update ConversationBufferMemory with Intents
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. literalinclude:: includes/rag/intent_detection.py
-    :language: python
-    :linenos:
-    :lines: 24-64
-
-Step 3: Get Messages based on latest drift
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. literalinclude:: includes/rag/intent_detection.py
-    :language: python
-    :linenos:
-    :lines: 67-80
-
-
-You can used the last set of messages that match to an intent to prompt an LLM, use it with an vector-DB for
-improved retrieval, etc. With Arch and a few lines of code, you can improve the retrieval accuracy, lower overall
-token cost and dramatically improve the speed of their responses back to users.
+Arch is highly capable of accurately detecting and processing prompts in a multi-turn scenarios so that you can buil fast and accurate RAG apps in
+minutes. For additional details on how to build multi-turn RAG applications please refer to our :ref:`multi-turn <arch_multi_turn_guide>` docs.
--- a/docs/source/concepts/tech_overview/prompt.rst
+++ b/docs/source/concepts/tech_overview/prompt.rst
@ -87,7 +87,7 @@ For more details on how you can build agentic applications using Arch, see our f
 .. Note::
   `Arch-Function <https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68>`_ is a collection of dedicated agentic models engineered in Arch to extract information from a (set of) prompts and executes necessary backend API calls.
   This allows for efficient handling of agentic tasks, such as scheduling data retrieval, by dynamically interacting with backend services.
-   Arch-Function achieves state-of-the-art performance, comparable with frontier models like Claude Sonnet 3.5 ang GPT-4, while being 100x cheaper ($0.05M/token hosted) and 10x faster (p50 latencies of 200ms).
+   Arch-Function achieves state-of-the-art performance, comparable with frontier models like Claude Sonnet 3.5 ang GPT-4, while being 44x cheaper ($0.10M/token hosted) and 10x faster (p50 latencies of 200ms).

 Prompting LLMs
 --------------
--- a/docs/source/concepts/tech_overview/terminology.rst
+++ b/docs/source/concepts/tech_overview/terminology.rst
@ -3,8 +3,16 @@
 Terminology
 ============

-A few definitions before we dive into the main architecture documentation. Arch borrows from Envoy's terminology
-to keep things consistent in logs, traces and in code.
+A few definitions before we dive into the main architecture documentation. Also note, Arch borrows from Envoy's terminology
+to keep things consistent in logs and traces, and introduces and clarifies concepts are is relates to LLM applications.
+
+**Agent**: An application that uses LLMs to handle wide-ranging tasks from users via prompts. This could be as simple
+as retrieving or summarizing data from an API, or being able to trigger compleix actions like adjusting ad campaigns, or
+changing travel plans via prompts.
+
+**Arch Config**: Arch operates based on a configuration that controls the behavior of a single instance of the Arch gateway.
+This where you enable capabilities like LLM routing, fast function calling (via prompt_targets), applying guardrails, and enabling critical
+features like metrics and tracing. For the full configuration reference of `arch_config.yaml` see :ref:`here <configuration_refernce>`.

 **Downstream(Ingress)**: An downstream client (web application, etc.) connects to Arch, sends prompts, and receives responses.

@ -14,9 +22,10 @@ to keep things consistent in logs, traces and in code.
   :width: 100%
   :align: center

-**Listener**: A :ref:`listener <arch_overview_listeners>` is a named network location (e.g., port, address, path etc.) that Arch listens on to process prompts
-before forwarding them to your application server endpoints. rch enables you to configure one listener for downstream connections
-(like port 80, 443) and creates a separate internal listener for calls that initiate from your application code to LLMs.
+**Listener**: A :ref:`listener <arch_overview_listeners>` is a named network location (e.g., port, address, path etc.) that Arch
+listens on to process prompts before forwarding them to your application server endpoints. rch enables you to configure one listener
+for downstream connections (like port 80, 443) and creates a separate internal listener for calls that initiate from your application
+code to LLMs.

 .. Note::

@ -24,23 +33,18 @@ before forwarding them to your application server endpoints. rch enables you to
   that you can use (``127.0.0.1:12000``) to proxy egress calls originating from your application to LLMs (API-based or hosted).
   For more details, check out :ref:`LLM provider <llm_provider>`.

-**Instance**: An instance of the Arch gateway. When you start Arch it creates at most two processes. One to handle Layer 7
-networking operations (auth, tls, observability, etc) and the second process to serve models that enable it to make smart
-decisions on how to accept, handle and forward prompts. The second process is optional, as the model serving sevice could be
-hosted on a different network (an API call). But these two processes are considered a single instance of Arch.
+**Prompt Target**: Arch offers a primitive called :ref:`prompt target <prompt_target>` to help separate business logic from
+undifferentiated work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
+For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt so that you
+can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or sharing comments on a
+document - via prompts, Arch uses its function calling abilities to extract critical information fromthe incoming prompt (or a set of
+prompts) needed by a downstream backend API or function call before calling it directly.

-**Prompt Target**: Arch offers a primitive called :ref:`prompt target <prompt_target>` to help separate business logic from undifferentiated
-work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
-For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt
-so that you can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or
-sharing comments on a document - via prompts, Arch uses its function calling abilities to extract critical information from
-the incoming prompt (or a set of prompts) needed by a downstream backend API or function call before calling it directly.
+**Model Serving**: Arch is a set of `two` self-contained processes that are designed to run alongside your application servers
+(or on a separate hostconnected via a network).The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions
+about the incoming prompts. The model server is designed to call the (fast) purpose-built LLMs in Arch.

 **Error Target**: :ref:`Error targets <error_target>` are those endpoints that receive forwarded errors from Arch when issues arise,
 such as failing to properly call a function/API, detecting violations of guardrails, or encountering other processing errors.
 These errors are communicated to the application via headers ``X-Arch-[ERROR-TYPE]``, allowing it to handle the errors gracefully
 and take appropriate actions.
-
-**Model Serving**: Arch is a set of `two` self-contained processes that are designed to run alongside your application servers
-(or on a separate hostconnected via a network).The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions about the
-incoming prompts. The model server is designed to call the (fast) purpose-built LLMs in Arch.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -15,7 +15,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Arch Docs"
 copyright = "2024, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.1.5"
+release = " v0.1.7"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -25,7 +25,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.

   $ python -m venv venv
   $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install archgw==0.1.6
+   $ pip install archgw==0.1.7


 Build AI Agent with Arch Gateway
--- a/docs/source/guides/observability/monitoring.rst
+++ b/docs/source/guides/observability/monitoring.rst
@ -3,7 +3,75 @@
 Monitoring
 ==========

-Arch offers several monitoring metrics that help you understand three critical aspects of your application:
-latency, token usage, and error rates by an upstream LLM provider. Latency measures the speed at which your
-application is responding to users, which includes metrics like time to first token (TFT), time per output
-token (TOT) metrics, and the total latency as perceived by users.
+`OpenTelemetry <https://opentelemetry.io/>`_ is an open-source observability framework providing APIs
+and instrumentation for generating, collecting, processing, and exporting telemetry data, such as traces,
+metrics, and logs. Its flexible design supports a wide range of backends and seamlessly integrates with
+modern application tools.
+
+Arch acts a *source* for several monitoring metrics related to **prompts** and **LLMs** natively integrated
+via `OpenTelemetry <https://opentelemetry.io/>`_ to help you understand three critical aspects of your application:
+latency, token usage, and error rates by an upstream LLM provider. Latency measures the speed at which your application
+is responding to users, which includes metrics like time to first token (TFT), time per output token (TOT) metrics, and
+the total latency as perceived by users. Below are some screenshots how Arch integrates natively with tools like
+`Grafana <https://grafana.com/grafana/dashboards/>`_ via `Promethus <https://prometheus.io/>`_
+
+
+Metrics Dashboard (via Grafana)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. image:: /_static/img/llm-request-metrics.png
+   :width: 100%
+   :align: center
+
+.. image:: /_static/img/input-token-metrics.png
+   :width: 100%
+   :align: center
+
+.. image:: /_static/img/output-token-metrics.png
+   :width: 100%
+   :align: center
+
+Configure Monitoring
+~~~~~~~~~~~~~~~~~~~~
+Arch gateway publishes stats endpoint at http://localhost:19901/stats. As noted above, Arch is a source for metrics. To view and manipulate dashbaords, you will
+need to configiure `Promethus <https://prometheus.io/>`_ (as a metrics store) and `Grafana <https://grafana.com/grafana/dashboards/>`_ for dashboards. Below
+are some sample configuration files for both, respectively.
+
+.. code-block:: yaml
+    :caption: Sample prometheus.yaml config file
+
+    global:
+    scrape_interval: 15s
+    scrape_timeout: 10s
+    evaluation_interval: 15s
+    alerting:
+    alertmanagers:
+        - static_configs:
+            - targets: []
+        scheme: http
+        timeout: 10s
+        api_version: v2
+    scrape_configs:
+    - job_name: archgw
+        honor_timestamps: true
+        scrape_interval: 15s
+        scrape_timeout: 10s
+        metrics_path: /stats
+        scheme: http
+        static_configs:
+        - targets:
+            - host.docker.internal:19901
+        params:
+        format: ["prometheus"]
+
+
+.. code-block:: yaml
+    :caption: Sample grafana datasource.yaml config file
+
+    apiVersion: 1
+    datasources:
+    - name: Prometheus
+        type: prometheus
+        url: http://prometheus:9090
+        isDefault: true
+        access: proxy
+        editable: true
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -14,7 +14,7 @@ Welcome to Arch!

   <a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>

-`Arch <https://github.com/katanemo/arch>`_ is an intelligent infrastructure primitive for GenAI (built by the contributors of `Envoy <https://www.envoyproxy.io/>`_ ) that born out of the belief that:
+`Arch <https://github.com/katanemo/arch>`_ is an intelligent gateway for agents - an infrastructure primitive for GenAI (built by the contributors of `Envoy <https://www.envoyproxy.io/>`_ ). The project was born out of the belief that:

  *Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems for personalization - all outside business logic.*

@ -62,6 +62,7 @@ Welcome to Arch!

      build_with_arch/agent
      build_with_arch/rag
+      build_with_arch/multi_turn

  .. tab-item:: Resources

--- a/docs/source/resources/configuration_reference.rst
+++ b/docs/source/resources/configuration_reference.rst
@ -1,11 +1,11 @@
-Configuration Reference
-============================
+.. _configuration_refernce:

-The following is a complete reference of the ``prompt-conifg.yml`` that controls the behavior of a single instance of
-the Arch gateway. We've kept things simple (less than 80 lines) and held off on exposing additional functionality (for
-e.g. suppporting push observability stats, managing prompt-endpoints as virtual cluster, exposing more load balancing
-options, etc). Our belief that the simple things, should be simple. So we offert good defaults for developers, so
-that they can spend more of their time in building features unique to their AI experience.
+Configuration Reference
+=======================
+
+The following is a complete reference of the ``arch_conifg.yml`` that controls the behavior of a single instance of
+the Arch gateway. This where you enable capabilities like routing to upstream LLm providers, defining prompt_targets
+where prompts get routed to, apply guardrails, and enable critical agent observability features.

 .. literalinclude:: includes/arch_config_full_reference.yaml
    :language: yaml
--- a/e2e_tests/.vscode/launch.json
+++ b/e2e_tests/.vscode/launch.json
@ -1,15 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "Python Debugger: Current File",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "${file}",
-            "console": "integratedTerminal"
-        }
-    ]
-}
--- a/e2e_tests/.vscode/settings.json
+++ b/e2e_tests/.vscode/settings.json
@ -1,7 +0,0 @@
-{
-  "python.testing.pytestArgs": [
-    "."
-  ],
-  "python.testing.unittestEnabled": false,
-  "python.testing.pytestEnabled": true
-}
--- a/e2e_tests/api_model_server.rest
+++ b/e2e_tests/api_model_server.rest
@ -1,44 +0,0 @@
-@model_server_endpoint = http://localhost:51000
-@archfc_endpoint = https://api.fc.archgw.com
-
-### talk to model_server for completion
-POST {{model_server_endpoint}}/v1/chat/completions HTTP/1.1
-Content-Type: application/json
-
-{
-  "messages": [
-    {
-      "role": "user",
-      "content": "how is the weather in seattle for next 10 days"
-    }
-  ],
-  "tools": [
-    {
-        "id": "weather-112",
-        "tool_type": "function",
-        "function": {
-          "name": "weather_forecast",
-          "arguments": {"city": "str", "days": "int"}
-        }
-    }
-  ]
-}
-
-
-### talk to arch_fc directly for completion
-POST {{archfc_endpoint}}/v1/chat/completions HTTP/1.1
-Content-Type: application/json
-
-{
-  "model": "Arch-Function",
-  "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"id\": \"weather-112\", \"tool_type\": \"function\", \"function\": {\"name\": \"weather_forecast\", \"arguments\": {\"city\": \"str\", \"days\": \"int\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"
-    },
-    { "role": "user", "content": "how is the weather in seattle?" },
-    { "role": "assistant", "content": "Of course! " }
-  ],
-  "continue_final_message": true,
-  "add_generation_prompt": false
-}
--- a/model_server/.vscode/launch.json
+++ b/model_server/.vscode/launch.json
@ -9,7 +9,7 @@
      "type": "debugpy",
      "request": "launch",
      "module": "uvicorn",
-      "args": ["app.main:app","--reload", "--port", "51000"]
+      "args": ["src.main:app","--reload", "--port", "51000"]
    }
  ]
 }
--- a/model_server/Dockerfile
+++ b/model_server/Dockerfile
@ -15,7 +15,7 @@ WORKDIR /src
 # specify list of models that will go into the image as a comma separated list
 # following models have been tested to work with this image
 # "sentence-transformers/all-MiniLM-L6-v2,sentence-transformers/all-mpnet-base-v2,thenlper/gte-base,thenlper/gte-large,thenlper/gte-small"
-ENV MODELS="katanemo/bge-large-en-v1.5-onnx"
+ENV MODELS=""

 COPY ./app ./app
 COPY ./app/guard_model_config.yaml .
@ -28,4 +28,4 @@ COPY ./app/openai_params.yaml .
 # RUN python install.py && \
 #   find /root/.cache/torch/sentence_transformers/ -name onnx -exec rm -rf {} +

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
+CMD ["uvicorn", "src.app.main:app", "--host", "0.0.0.0", "--port", "80"]
--- a/model_server/Dockerfile.gpu
+++ b/model_server/Dockerfile.gpu
@ -45,7 +45,7 @@ RUN if command -v nvcc >/dev/null 2>&1; then \
 COPY . /src

 # Specify list of models that will go into the image as a comma separated list
-ENV MODELS="katanemo/bge-large-en-v1.5-onnx"
+ENV MODELS=""
 ENV DEBIAN_FRONTEND=noninteractive

 COPY /app /app
--- a/Show more
+++ b/Show more