Merge branch 'main' into adil/add_acm_demo

2026-06-17 15:25:17 +02:00 · 2025-04-15 15:12:46 -07:00 · 2025-04-15 15:12:46 -07:00 · 6edad0870b
commit 6edad0870b
parent 9cb04756c5 c7c0553427
125 changed files with 6680 additions and 2314 deletions
--- a/.github/workflows/docker-push-main.yml
+++ b/.github/workflows/docker-push-main.yml
@ -1,4 +1,4 @@
-name: Publish Docker image
+name: Publish docker image (latest)

 env:
  DOCKER_IMAGE: katanemo/archgw
--- a/.github/workflows/docker-push-release.yml
+++ b/.github/workflows/docker-push-release.yml
@ -0,0 +1,107 @
+name: Publish docker image (release)
+
+# On every published release: build the image natively for each architecture,
+# push each build under an arch-suffixed tag, then combine both into a single
+# multi-arch manifest tagged with the release tag.
+
+env:
+  DOCKER_IMAGE: katanemo/archgw
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  # Build ARM64 image on native ARM64 runner
+  build-arm64:
+    runs-on: [linux-arm64]
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_IMAGE }}
+          # Emit exactly one tag (the release tag) so the arch suffix appended
+          # below applies to the whole value instead of only its last line.
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.event.release.tag_name }}
+
+      - name: Build and Push ARM64 Image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./arch/Dockerfile
+          platforms: linux/arm64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}-arm64
+
+  # Build AMD64 image on GitHub's AMD64 runner
+  build-amd64:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.event.release.tag_name }}
+
+      - name: Build and Push AMD64 Image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./arch/Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}-amd64
+
+  # Combine ARM64 and AMD64 images into a multi-arch manifest
+  create-manifest:
+    runs-on: ubuntu-latest
+    needs: [build-arm64, build-amd64]  # Wait for both builds
+    steps:
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.event.release.tag_name }}
+
+      - name: Create Multi-Arch Manifest
+        run: |
+          # Source images must be the exact arch-suffixed tags pushed above
+          docker buildx imagetools create -t ${{ steps.meta.outputs.tags }} \
+            ${{ steps.meta.outputs.tags }}-arm64 \
+            ${{ steps.meta.outputs.tags }}-amd64
--- a/.github/workflows/e2e_archgw.yml
+++ b/.github/workflows/e2e_archgw.yml
@ -30,6 +30,7 @@ jobs:
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
        run: |
          docker compose up | tee &> archgw.logs &

@ -55,5 +56,6 @@ jobs:
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
        run: |
          docker compose down
--- a/.github/workflows/e2e_test_demos.yml
+++ b/.github/workflows/e2e_test_demos.yml
@ -32,6 +32,11 @@ jobs:
        run: |
          python -m venv venv

+      - name: install hurl
+        run: |
+          curl --location --remote-name https://github.com/Orange-OpenSource/hurl/releases/download/4.0.0/hurl_4.0.0_amd64.deb
+          sudo dpkg -i hurl_4.0.0_amd64.deb
+
      - name: install model server, arch gateway and test dependencies
        run: |
          source venv/bin/activate
@ -43,6 +48,7 @@ jobs:
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
        run: |
          source venv/bin/activate
          cd demos/shared/test_runner && sh run_demo_tests.sh
--- a/.github/workflows/e2e_tests.yml
+++ b/.github/workflows/e2e_tests.yml
@ -29,6 +29,7 @@ jobs:
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
        run: |
          python -mvenv venv
          source venv/bin/activate && cd tests/e2e && bash run_e2e_tests.sh
--- a/.github/workflows/validate_arch_config.yml
+++ b/.github/workflows/validate_arch_config.yml
@ -0,0 +1,34 @
+name: arch config tests
+
+# Builds the arch gateway image and runs arch/validate_arch_config.sh on every
+# push to main and on every pull request.
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  validate_arch_config:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: .
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: build arch docker image
+        run: |
+          docker build -f arch/Dockerfile . -t katanemo/archgw
+
+      - name: validate arch config
+        run: |
+          bash arch/validate_arch_config.sh
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,4 +1,5 @@
-#  Contribution
+# Contribution
+
 We would love feedback on our [Roadmap](https://github.com/orgs/katanemo/projects/1) and we welcome contributions to **Arch**!
 Whether you're fixing bugs, adding new features, improving documentation, or creating tutorials, your help is much appreciated.

@ -22,7 +23,9 @@ $ cd arch
 ```

 ### 3. Create a branch
+
 Use a descriptive name for your branch (e.g., fix-bug-123, add-feature-x).
+
 ```bash
 $ git checkout -b <your-branch-name>
 ```
@ -32,6 +35,7 @@ $ git checkout -b <your-branch-name>
 Make your changes in the relevant files. If you're adding new features or fixing bugs, please include tests where applicable.

 ### 5. Test your changes
+
 ```bash
 cd arch
 cargo test
@ -51,4 +55,4 @@ Contribution Guidelines
    Follow the existing coding style.
    Update documentation as needed.

-To get in touch with us, please join our [discord server](https://discord.gg/rbjqVbpa). We will be monitoring that actively and offering support there.
+To get in touch with us, please join our [discord server](https://discord.gg/pGZf2gcwEc). We will be monitoring that actively and offering support there.
--- a/README.md
+++ b/README.md
@ -3,7 +3,9 @@
 </div>
 <div align="center">

-_Arch is an intelligent (edge and LLM) proxy designed for agentic applications - to help you protect, observe, and build agentic tasks by simply connecting (existing) APIs._
+
+_The intelligent (edge and LLM) proxy server for agentic applications._<br><br>
+Move faster by letting Arch handle the **pesky** heavy lifting in building agents: fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs.

 [Quickstart](#Quickstart) •
 [Demos](#Demos) •
@ -16,26 +18,32 @@ _Arch is an intelligent (edge and LLM) proxy designed for agentic applications -
 [![rust tests (prompt and llm gateway)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml)
 [![e2e tests](https://github.com/katanemo/arch/actions/workflows/e2e_tests.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/e2e_tests.yml)
 [![Build and Deploy Documentation](https://github.com/katanemo/arch/actions/workflows/static.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/static.yml)
+
+
 </div>

 # Overview
-<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=dark&period=daily&t=1742359429995" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 188px; height: 41px;" width="188" height="41" /></a>

+Past the thrill of an AI demo, have you found yourself hitting these walls? You know, the all too familiar ones:

-Arch Gateway was built by the contributors of [Envoy Proxy](https://www.envoyproxy.io/) with the belief that:
+- You go from one BIG prompt to specialized prompts, but get stuck building **routing and handoff** code?
+- You want to use new LLMs, but struggle to **quickly and safely add LLMs** without writing integration code?
+- You're bogged down with prompt engineering just to **clarify user intent and validate inputs** effectively?
+- You're wasting cycles choosing and integrating code for **observability** instead of it happening transparently?

->Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems for personalization – outside core business logic.*
+And you think to yourself, can't I move faster by focusing on higher-level objectives in a language/framework agnostic way? Well, you can! **Arch Gateway** was built by the contributors of [Envoy Proxy](https://www.envoyproxy.io/) with the belief that:

-
-Arch is engineered with purpose-built LLMs to handle critical but pesky tasks related to the handling and processing of prompts. This includes detecting and rejecting [jailbreak](https://github.com/verazuo/jailbreak_llms) attempts, intent-based routing for improved task accuracy, mapping user request into "backend" functions, and managing the observability of prompts and LLM API calls in a centralized way.
+>Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems to improve speed and accuracy for common agentic scenarios  – all outside core application logic.*

 **Core Features**:

-  - **Intent-based prompt routing & fast ⚡ function-calling via APIs**. Engineered with purpose-built [LLMs](https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68) to handle fast, cost-effective, and accurate prompt-based tasks like function/API calling, and parameter extraction from prompts to build more task-accurate agentic applications.
-  - **Prompt [Guard](https://huggingface.co/collections/katanemo/arch-guard-6702bdc08b889e4bce8f446d)**: Arch centralizes guardrails to prevent jailbreak attempts and ensure safe user interactions without writing a single line of code.
-  - **LLM Routing & Traffic Management**: Arch centralizes calls to LLMs used by your applications, offering smart retries, automatic cutover, and resilient upstream connections for continuous availability.
-  - **Observability**: Arch uses the W3C Trace Context standard to enable complete request tracing across applications, ensuring compatibility with observability tools, and provides metrics to monitor latency, token usage, and error rates, helping optimize AI application performance.
-  - **Built on [Envoy](https://envoyproxy.io)**: Arch runs alongside application servers as a separate containerized process, and builds on top of Envoy's proven HTTP management and scalability features to handle ingress and egress traffic related to prompts and LLMs.
+  - `🚦 Routing`: Engineered with purpose-built [LLMs](https://huggingface.co/collections/katanemo/arch-function-66f209a693ea8df14317ad68) for fast (<100ms) agent routing and hand-off scenarios
+  - `⚡ Tools Use`: For common agentic scenarios let Arch instantly clarify and convert prompts to tools/API calls
+  - `⛨ Guardrails`: Centrally configure and prevent harmful outcomes and ensure safe user interactions
+  - `🔗 Access to LLMs`: Centralize access and traffic to LLMs with smart retries for continuous availability
+  - `🕵 Observability`: W3C compatible request tracing and LLM metrics that instantly plug in with popular tools
+  - `🧱 Built on Envoy`: Arch runs alongside app servers as a containerized process, and builds on top of [Envoy's](https://envoyproxy.io) proven HTTP management and scalability features to handle ingress and egress traffic related to prompts and LLMs.

 **High-Level Sequence Diagram**:
 ![alt text](docs/source/_static/img/arch_network_diagram_high_level.png)
@ -73,7 +81,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.
 ```console
 $ python -m venv venv
 $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-$ pip install archgw==0.2.1
+$ pip install archgw==0.2.6
 ```

 ### Build AI Agent with Arch Gateway
@ -143,7 +151,7 @@ $ archgw up arch_config.yaml
 2024-12-05 16:56:27,979 - cli.main - INFO - Starting archgw cli version: 0.1.5
 ...
 2024-12-05 16:56:28,485 - cli.utils - INFO - Schema validation successful!
-2024-12-05 16:56:28,485 - cli.main - INFO - Starging arch model server and arch gateway
+2024-12-05 16:56:28,485 - cli.main - INFO - Starting arch model server and arch gateway
 ...
 2024-12-05 16:56:51,647 - cli.core - INFO - Container is healthy!

@ -241,7 +249,7 @@ client = OpenAI(

 response = client.chat.completions.create(
    # we select model from arch_config file
-    model="--",
+    model="None",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
 )

@ -301,6 +309,33 @@ Arch is designed to support best-in class observability by supporting open stand

 ![alt text](docs/source/_static/img/tracing.png)

+## Debugging
+
+When debugging issues or errors, application logs and access logs provide key information that gives you more context on what's going on with the system. Arch gateway runs at the info log level, and the following is typical output you would see for an interaction between a developer and Arch gateway:
+
+```
+$ archgw up --service archgw --foreground
+...
+[2025-03-26 18:32:01.350][26][info] prompt_gateway: on_http_request_body: sending request to model server
+[2025-03-26 18:32:01.851][26][info] prompt_gateway: on_http_call_response: model server response received
+[2025-03-26 18:32:01.852][26][info] prompt_gateway: on_http_call_response: dispatching api call to developer endpoint: weather_forecast_service, path: /weather, method: POST
+[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: developer api call response received: status code: 200
+[2025-03-26 18:32:01.882][26][info] prompt_gateway: on_http_call_response: sending request to upstream llm
+[2025-03-26 18:32:01.883][26][info] llm_gateway: on_http_request_body: provider: gpt-4o-mini, model requested: None, model selected: gpt-4o-mini
+[2025-03-26 18:32:02.818][26][info] llm_gateway: on_http_response_body: time to first token: 1468ms
+[2025-03-26 18:32:04.532][26][info] llm_gateway: on_http_response_body: request latency: 3183ms
+...
+```
+
+The log level can be changed to debug to get more details. To enable debug logs, edit the [Dockerfile](arch/Dockerfile) and change the log level from `--component-log-level wasm:info` to `--component-log-level wasm:debug`. After that, rebuild the docker image and restart the arch gateway using the following commands:
+
+```
+# make sure you are at the root of the repo
+$ archgw build
+# go to your service that has arch_config.yaml file and issue following command,
+$ archgw up --service archgw --foreground
+```
+
 ## Contribution
 We would love feedback on our [Roadmap](https://github.com/orgs/katanemo/projects/1) and we welcome contributions to **Arch**!
 Whether you're fixing bugs, adding new features, improving documentation, or creating tutorials, your help is much appreciated.
--- a/arch/Dockerfile
+++ b/arch/Dockerfile
@ -28,4 +28,5 @@ COPY arch/arch_config_schema.yaml .
 RUN pip install requests
 RUN touch /var/log/envoy.log

-ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:debug 2>&1 | tee /var/log/envoy.log"]
+# ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --log-level trace 2>&1 | tee /var/log/envoy.log"]
+ENTRYPOINT ["sh","-c", "python config_generator.py && envsubst < /etc/envoy/envoy.yaml > /etc/envoy.env_sub.yaml && envoy -c /etc/envoy.env_sub.yaml --component-log-level wasm:info 2>&1 | tee /var/log/envoy.log"]
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -5,8 +5,9 @@ properties:
    type: string
  listeners:
    type: object
+    additionalProperties: false
    properties:
-      prompt_gateway:
+      ingress_traffic:
        type: object
        properties:
          address:
@ -20,7 +21,7 @@ properties:
          timeout:
            type: string
        additionalProperties: false
-      llm_gateway:
+      egress_traffic:
        type: object
        properties:
          address:
@ -31,7 +32,6 @@ properties:
            type: string
            enum:
              - openai
-              - huggingface
          timeout:
            type: string
        additionalProperties: false
@ -62,7 +62,7 @@ properties:
      properties:
        name:
          type: string
-        # this field is deprecated, use provider_interface instead
+        # provider field is deprecated, use provider_interface instead
        provider:
          type: string
          enum:
@ -78,8 +78,11 @@ properties:
          type: string
        default:
          type: boolean
+        # endpoint field is deprecated, use base_url instead
        endpoint:
          type: string
+        base_url:
+          type: string
        protocol:
          type: string
          enum:
@ -90,7 +93,6 @@ properties:
      additionalProperties: false
      required:
        - name
-        - model
  overrides:
    type: object
    properties:
@ -98,6 +100,8 @@ properties:
        type: number
      optimize_context_window:
        type: boolean
+      use_agent_orchestrator:
+        type: boolean
  system_prompt:
    type: string
  prompt_targets:
@ -124,7 +128,10 @@ properties:
              required:
                type: boolean
              default:
-                type: string
+                anyOf:
+                  - type: string
+                  - type: integer
+                  - type: boolean
              description:
                type: string
              type:
@ -132,7 +139,10 @@ properties:
              enum:
                type: array
                items:
-                  type: string
+                  anyOf:
+                    - type: string
+                    - type: integer
+                    - type: boolean
              in_path:
                type: boolean
              format:
@ -241,5 +251,4 @@ properties:
 additionalProperties: false
 required:
  - version
-  - listeners
  - llm_providers
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -29,7 +29,7 @@ stats_config:
      - 180000
 static_resources:
  listeners:
-    - name: arch_listener_http
+    - name: ingress_traffic
      address:
        socket_address:
          address: {{ prompt_gateway_listener.address }}
@ -55,7 +55,7 @@ static_resources:
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: ingress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -82,7 +82,7 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_prompt_gateway_listener
+    - name: ingress_traffic_prompt
      address:
        socket_address:
          address: 0.0.0.0
@ -104,11 +104,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: prompt_processor
+                      service_name: ingress_traffic
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_prompt_gateway_listener
+                stat_prefix: ingress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -142,6 +142,19 @@ static_resources:
                            cluster: {{ llm_cluster_name }}
                            timeout: 60s
                      {% endfor %}
+
+                      {% if agent_orchestrator %}
+                        - match:
+                            prefix: "/"
+                            headers:
+                              - name: "x-arch-llm-provider"
+                                string_match:
+                                  exact: {{ agent_orchestrator }}
+                          route:
+                            auto_host_rewrite: true
+                            cluster: {{ agent_orchestrator }}
+                            timeout: 60s
+                      {% endif %}
                http_filters:
                  - name: envoy.filters.http.compressor
                    typed_config:
@ -201,7 +214,7 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_internal
+    - name: egress_api_traffic
      address:
        socket_address:
          address: 0.0.0.0
@ -223,11 +236,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: prompt_processor
+                      service_name: egress_api_traffic
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_internal
+                stat_prefix: egress_api_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -273,12 +286,12 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_listener_http_llm
+    - name: egress_traffic
      address:
        socket_address:
          address: {{ llm_gateway_listener.address }}
          port_value: {{ llm_gateway_listener.port }}
-      traffic_direction: INBOUND
+      traffic_direction: OUTBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
@ -299,7 +312,7 @@ static_resources:
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: egress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -326,7 +339,7 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_listener_llm
+    - name: egress_traffic_llm
      address:
        socket_address:
          address: 0.0.0.0
@ -347,11 +360,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: llm_gateway
+                      service_name: egress_traffic_llm
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: egress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
--- a/arch/tools/README.md
+++ b/arch/tools/README.md
@ -19,7 +19,7 @@ source venv/bin/activate

 ### Step 3: Run the build script
 ```bash
-pip install archgw==0.2.1
+pip install archgw==0.2.6
 ```

 ## Uninstall Instructions: archgw CLI
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -3,6 +3,7 @@ import os
 from jinja2 import Environment, FileSystemLoader
 import yaml
 from jsonschema import validate
+from urllib.parse import urlparse

 ENVOY_CONFIG_TEMPLATE_FILE = os.getenv(
    "ENVOY_CONFIG_TEMPLATE_FILE", "envoy.template.yaml"
@ -47,7 +48,7 @@ def validate_and_render_schema():
        arch_config_schema = file.read()

    config_yaml = yaml.safe_load(arch_config)
-    config_schema_yaml = yaml.safe_load(arch_config_schema)
+    _ = yaml.safe_load(arch_config_schema)
    inferred_clusters = {}

    endpoints = config_yaml.get("endpoints", {})
@ -91,6 +92,9 @@ def validate_and_render_schema():
            del llm_provider["provider"]
        updated_llm_providers.append(llm_provider)

+        if llm_provider.get("endpoint") and llm_provider.get("base_url"):
+            raise Exception("Please provide either endpoint or base_url, not both")
+
        if llm_provider.get("endpoint", None):
            endpoint = llm_provider["endpoint"]
            protocol = llm_provider.get("protocol", "http")
@ -98,13 +102,39 @@ def validate_and_render_schema():
                endpoint, protocol
            )
            llms_with_endpoint.append(llm_provider)
+        elif llm_provider.get("base_url", None):
+            base_url = llm_provider["base_url"]
+            urlparse_result = urlparse(base_url)
+            if llm_provider.get("port"):
+                raise Exception("Please provider port in base_url")
+            if urlparse_result.scheme == "" or urlparse_result.scheme not in [
+                "http",
+                "https",
+            ]:
+                raise Exception(
+                    "Please provide a valid URL with scheme (http/https) in base_url"
+                )
+            protocol = urlparse_result.scheme
+            port = urlparse_result.port
+            if port is None:
+                if protocol == "http":
+                    port = 80
+                else:
+                    port = 443
+            endpoint = urlparse_result.hostname
+            llm_provider["endpoint"] = endpoint
+            llm_provider["port"] = port
+            llm_provider["protocol"] = protocol
+            llms_with_endpoint.append(llm_provider)

    config_yaml["llm_providers"] = updated_llm_providers

    arch_config_string = yaml.dump(config_yaml)
    arch_llm_config_string = yaml.dump(config_yaml)

-    prompt_gateway_listener = config_yaml.get("listeners", {}).get("prompt_gateway", {})
+    prompt_gateway_listener = config_yaml.get("listeners", {}).get(
+        "ingress_traffic", {}
+    )
    if prompt_gateway_listener.get("port") == None:
        prompt_gateway_listener["port"] = 10000  # default port for prompt gateway
    if prompt_gateway_listener.get("address") == None:
@ -112,7 +142,7 @@ def validate_and_render_schema():
    if prompt_gateway_listener.get("timeout") == None:
        prompt_gateway_listener["timeout"] = "10s"

-    llm_gateway_listener = config_yaml.get("listeners", {}).get("llm_gateway", {})
+    llm_gateway_listener = config_yaml.get("listeners", {}).get("egress_traffic", {})
    if llm_gateway_listener.get("port") == None:
        llm_gateway_listener["port"] = 12000  # default port for llm gateway
    if llm_gateway_listener.get("address") == None:
@ -120,6 +150,26 @@ def validate_and_render_schema():
    if llm_gateway_listener.get("timeout") == None:
        llm_gateway_listener["timeout"] = "10s"

+    use_agent_orchestrator = config_yaml.get("overrides", {}).get(
+        "use_agent_orchestrator", False
+    )
+
+    agent_orchestrator = None
+    if use_agent_orchestrator:
+        print("Using agent orchestrator")
+
+        if len(endpoints) == 0:
+            raise Exception(
+                "Please provide agent orchestrator in the endpoints section in your arch_config.yaml file"
+            )
+        elif len(endpoints) > 1:
+            raise Exception(
+                "Please provide single agent orchestrator in the endpoints section in your arch_config.yaml file"
+            )
+        else:
+            agent_orchestrator = list(endpoints.keys())[0]
+
+    print("agent_orchestrator: ", agent_orchestrator)
    data = {
        "prompt_gateway_listener": prompt_gateway_listener,
        "llm_gateway_listener": llm_gateway_listener,
@ -129,6 +179,7 @@ def validate_and_render_schema():
        "arch_llm_providers": config_yaml["llm_providers"],
        "arch_tracing": arch_tracing,
        "local_llms": llms_with_endpoint,
+        "agent_orchestrator": agent_orchestrator,
    }

    rendered = template.render(data)
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -2,110 +2,49 @@ import subprocess
 import os
 import time
 import sys
-import glob
-import docker
-from docker.errors import DockerException
-from cli.utils import getLogger, update_docker_host_env
+
+import yaml
+from cli.utils import getLogger
 from cli.consts import (
-    ARCHGW_DOCKER_IMAGE,
    ARCHGW_DOCKER_NAME,
    KATANEMO_LOCAL_MODEL_LIST,
-    MODEL_SERVER_LOG_FILE,
-    ACCESS_LOG_FILES,
 )
 from huggingface_hub import snapshot_download
-from dotenv import dotenv_values
-import yaml
+import subprocess
+from cli.docker_cli import (
+    docker_container_status,
+    docker_remove_container,
+    docker_start_archgw_detached,
+    docker_stop_container,
+    health_check_endpoint,
+    stream_gateway_logs,
+)


 log = getLogger(__name__)


-def start_archgw_docker(
-    client, arch_config_file, env, prompt_gateway_port, llm_gateway_port
-):
-    logs_path = "~/archgw_logs"
-    logs_path_abs = os.path.expanduser(logs_path)
+def _get_gateway_ports(arch_config_file: str) -> tuple:
+    PROMPT_GATEWAY_DEFAULT_PORT = 10000
+    LLM_GATEWAY_DEFAULT_PORT = 12000

-    return client.containers.run(
-        name=ARCHGW_DOCKER_NAME,
-        image=ARCHGW_DOCKER_IMAGE,
-        detach=True,  # Run in detached mode
-        ports={
-            f"{prompt_gateway_port}/tcp": prompt_gateway_port,
-            "10001/tcp": 10001,
-            "11000/tcp": 11000,
-            f"{llm_gateway_port}/tcp": llm_gateway_port,
-            "9901/tcp": 19901,
-        },
-        volumes={
-            f"{arch_config_file}": {
-                "bind": "/app/arch_config.yaml",
-                "mode": "ro",
-            },
-            "/etc/ssl/cert.pem": {"bind": "/etc/ssl/cert.pem", "mode": "ro"},
-            logs_path_abs: {"bind": "/var/log"},
-        },
-        environment={
-            "OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
-            "MODEL_SERVER_PORT": os.getenv("MODEL_SERVER_PORT", "51000"),
-            **env,
-        },
-        extra_hosts={"host.docker.internal": "host-gateway"},
-        healthcheck={
-            "test": [
-                "CMD",
-                "curl",
-                "-f",
-                f"http://localhost:{prompt_gateway_port}/healthz",
-            ],
-            "interval": 5000000000,  # 5 seconds
-            "timeout": 1000000000,  # 1 seconds
-            "retries": 3,
-        },
+    # parse arch_config_file yaml file and get prompt_gateway_port
+    arch_config_dict = {}
+    with open(arch_config_file) as f:
+        arch_config_dict = yaml.safe_load(f)
+
+    prompt_gateway_port = (
+        arch_config_dict.get("listeners", {})
+        .get("ingress_traffic", {})
+        .get("port", PROMPT_GATEWAY_DEFAULT_PORT)
+    )
+    llm_gateway_port = (
+        arch_config_dict.get("listeners", {})
+        .get("egress_traffic", {})
+        .get("port", LLM_GATEWAY_DEFAULT_PORT)
    )

-
-def stream_gateway_logs(follow):
-    """
-    Stream logs from the arch gateway service.
-    """
-    log.info("Logs from arch gateway service.")
-
-    options = ["docker", "logs", "archgw"]
-    if follow:
-        options.append("-f")
-    try:
-        # Run `docker-compose logs` to stream logs from the gateway service
-        subprocess.run(
-            options,
-            check=True,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-
-    except subprocess.CalledProcessError as e:
-        log.info(f"Failed to stream logs: {str(e)}")
-
-
-def stream_access_logs(follow):
-    """
-    Get the archgw access logs
-    """
-    log_file_pattern_expanded = os.path.expanduser(ACCESS_LOG_FILES)
-    log_files = glob.glob(log_file_pattern_expanded)
-
-    stream_command = ["tail"]
-    if follow:
-        stream_command.append("-f")
-
-    stream_command.extend(log_files)
-    subprocess.run(
-        stream_command,
-        check=True,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-    )
+    return prompt_gateway_port, llm_gateway_port


 def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
@ -119,73 +58,58 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
    log.info("Starting arch gateway")

    try:
-        try:
-            client = docker.from_env()
-        except DockerException as e:
-            # try setting up the docker host environment variable and retry
-            update_docker_host_env()
-            client = docker.from_env()
+        archgw_container_status = docker_container_status(ARCHGW_DOCKER_NAME)
+        if archgw_container_status != "not found":
+            log.info("archgw found in docker, stopping and removing it")
+            docker_stop_container(ARCHGW_DOCKER_NAME)
+            docker_remove_container(ARCHGW_DOCKER_NAME)

-        try:
-            container = client.containers.get("archgw")
-            log.info("archgw container found in docker, stopping and removing it")
-            # ensure that previous docker container is stopped and removed
-            container.stop()
-            container.remove()
-            log.info("Stopped and removed archgw container")
-        except docker.errors.NotFound as e:
-            pass
+        prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file)

-        # parse arch_config_file yaml file and get prompt_gateway_port
-        arch_config_dict = {}
-        with open(arch_config_file) as f:
-            arch_config_dict = yaml.safe_load(f)
-
-        prompt_gateway_port = (
-            arch_config_dict.get("listeners", {})
-            .get("prompt_gateway", {})
-            .get("port", 10000)
-        )
-        llm_gateway_port = (
-            arch_config_dict.get("listeners", {})
-            .get("llm_gateway", {})
-            .get("port", 12000)
-        )
-
-        container = start_archgw_docker(
-            client, arch_config_file, env, prompt_gateway_port, llm_gateway_port
+        return_code, _, archgw_stderr = docker_start_archgw_detached(
+            arch_config_file,
+            os.path.expanduser("~/archgw_logs"),
+            env,
+            prompt_gateway_port,
+            llm_gateway_port,
        )
+        if return_code != 0:
+            log.info("Failed to start arch gateway: " + str(return_code))
+            log.info("stderr: " + archgw_stderr)
+            sys.exit(1)

        start_time = time.time()
-
        while True:
-            container = client.containers.get(container.id)
+            prompt_gateway_health_check_status = health_check_endpoint(
+                f"http://localhost:{prompt_gateway_port}/healthz"
+            )
+
+            llm_gateway_health_check_status = health_check_endpoint(
+                f"http://localhost:{llm_gateway_port}/healthz"
+            )
+
+            archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
            current_time = time.time()
            elapsed_time = current_time - start_time

            # Check if timeout is reached
            if elapsed_time > log_timeout:
-                log.info(f"Stopping log monitoring after {log_timeout} seconds.")
+                log.info(f"stopping log monitoring after {log_timeout} seconds.")
                break

-            container_status = container.attrs["State"]["Health"]["Status"]
-
-            if container_status == "healthy":
-                log.info("Container is healthy!")
+            if prompt_gateway_health_check_status or llm_gateway_health_check_status:
+                log.info("archgw is running and is healthy!")
                break
            else:
-                log.info(f"Container health status: {container_status}")
+                log.info(f"archgw status: {archgw_status}, health status: starting")
                time.sleep(1)

        if foreground:
-            for line in container.logs(stream=True):
-                print(line.decode("utf-8").strip("\n"))
+            stream_gateway_logs(follow=True)

    except KeyboardInterrupt:
        log.info("Keyboard interrupt received, stopping arch gateway service.")
        stop_arch()
-    except docker.errors.APIError as e:
-        log.info(f"Failed to start Arch: {str(e)}")


 def stop_arch():
@ -199,10 +123,10 @@ def stop_arch():

    try:
        subprocess.run(
-            ["docker", "stop", "archgw"],
+            ["docker", "stop", ARCHGW_DOCKER_NAME],
        )
        subprocess.run(
-            ["docker", "remove", "archgw"],
+            ["docker", "rm", ARCHGW_DOCKER_NAME],
        )

        log.info("Successfully shut down arch gateway service.")
--- a/arch/tools/cli/docker_cli.py
+++ b/arch/tools/cli/docker_cli.py
@ -0,0 +1,133 @@
+import subprocess
+import json
+import sys
+import requests
+
+from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
+from cli.utils import getLogger
+
+log = getLogger(__name__)
+
+
+def docker_container_status(container: str) -> str:
+    """Report the state of a docker container as seen by `docker inspect`.
+
+    Args:
+        container: Name or id of the container to inspect.
+
+    Returns:
+        "not found" when `docker inspect` exits with a non-zero code, otherwise
+        the `State.Status` field of the first inspected entry (an empty string
+        when that field is absent).
+    """
+    inspect_command = ["docker", "inspect", "--type=container", container]
+    proc = subprocess.run(
+        inspect_command, capture_output=True, text=True, check=False
+    )
+    if proc.returncode != 0:
+        return "not found"
+
+    # `docker inspect` prints a JSON array; only the first entry is consulted.
+    inspected = json.loads(proc.stdout)
+    state = inspected[0].get("State", {})
+    return state.get("Status", "")
+
+
+def docker_stop_container(container: str) -> int:
+    """Stop a docker container by running `docker stop`.
+
+    Args:
+        container: Name or id of the container to stop.
+
+    Returns:
+        The exit code of the `docker stop` command. Its stdout/stderr are
+        captured and discarded, and a non-zero exit code does not raise.
+    """
+    result = subprocess.run(
+        ["docker", "stop", container], capture_output=True, text=True, check=False
+    )
+    return result.returncode
+
+
+def docker_remove_container(container: str) -> int:
+    """Remove a docker container by running `docker rm`.
+
+    Args:
+        container: Name or id of the container to remove.
+
+    Returns:
+        The exit code of the `docker rm` command. Its stdout/stderr are
+        captured and discarded, and a non-zero exit code does not raise.
+    """
+    result = subprocess.run(
+        ["docker", "rm", container], capture_output=True, text=True, check=False
+    )
+    return result.returncode
+
+
+def docker_start_archgw_detached(
+    arch_config_file: str,
+    logs_path_abs: str,
+    env: dict,
+    prompt_gateway_port,
+    llm_gateway_port,
+) -> tuple[int, str, str]:
+    """Start the arch gateway container in the background with `docker run -d`.
+
+    Args:
+        arch_config_file: Host path of the config file, mounted read-only at
+            /app/arch_config.yaml inside the container.
+        logs_path_abs: Host directory mounted read-write at /var/log.
+        env: Environment variables passed to the container as `-e KEY=VALUE`.
+        prompt_gateway_port: Port published 1:1 between host and container.
+        llm_gateway_port: Port published 1:1 between host and container.
+
+    Returns:
+        A `(returncode, stdout, stderr)` tuple from the `docker run` command.
+        A non-zero return code does not raise; the caller must check it.
+    """
+    env_args = [item for key, value in env.items() for item in ["-e", f"{key}={value}"]]
+
+    # `docker run -p` takes HOST:CONTAINER. Container port 9901 is published on
+    # host port 19901, matching the mapping used before the move to the CLI.
+    port_mappings = [
+        f"{prompt_gateway_port}:{prompt_gateway_port}",
+        f"{llm_gateway_port}:{llm_gateway_port}",
+        "19901:9901",
+    ]
+    port_mappings_args = [item for port in port_mappings for item in ("-p", port)]
+
+    volume_mappings = [
+        f"{logs_path_abs}:/var/log:rw",
+        f"{arch_config_file}:/app/arch_config.yaml:ro",
+    ]
+    volume_mappings_args = [
+        item for volume in volume_mappings for item in ("-v", volume)
+    ]
+
+    options = [
+        "docker",
+        "run",
+        "-d",
+        "--name",
+        ARCHGW_DOCKER_NAME,
+        *port_mappings_args,
+        *volume_mappings_args,
+        *env_args,
+        # Lets the container reach services on the host via host.docker.internal.
+        "--add-host",
+        "host.docker.internal:host-gateway",
+        ARCHGW_DOCKER_IMAGE,
+    ]
+
+    result = subprocess.run(options, capture_output=True, text=True, check=False)
+    return result.returncode, result.stdout, result.stderr
+
+
+def health_check_endpoint(endpoint: str, timeout: float = 5.0) -> bool:
+    """Check whether an HTTP endpoint answers a GET request with status 200.
+
+    Args:
+        endpoint: Full URL to probe.
+        timeout: Seconds to wait for the request before giving up. Without a
+            timeout a stalled connection could block the caller indefinitely.
+
+    Returns:
+        True when the response status code is 200; False for any other status
+        or when the request fails (connection error, timeout, ...).
+    """
+    try:
+        response = requests.get(endpoint, timeout=timeout)
+    except requests.RequestException:
+        # Expected while the service is still starting; report "not healthy".
+        return False
+    return response.status_code == 200
+
+
+def stream_gateway_logs(follow):
+    """
+    Print the arch gateway container logs to this process's stdout/stderr.
+
+    When `follow` is truthy, `-f` is passed to `docker logs` so output keeps
+    streaming. A failing `docker logs` command is logged, not raised.
+    """
+    log.info("Logs from arch gateway service.")
+
+    follow_flag = ["-f"] if follow else []
+    command = ["docker", "logs", *follow_flag, ARCHGW_DOCKER_NAME]
+    try:
+        # Run `docker logs` with output wired straight to our own streams
+        subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stderr)
+    except subprocess.CalledProcessError as err:
+        log.info(f"Failed to stream logs: {str(err)}")
+
+
+def docker_validate_archgw_schema(arch_config_file):
+    """Run `config_generator.py` in a throwaway archgw container against a config.
+
+    The config file is mounted read-only at /app/arch_config.yaml and the image
+    entrypoint is overridden with `python`; the container is removed on exit.
+
+    Returns:
+        A `(returncode, stdout, stderr)` tuple from the `docker run` command.
+    """
+    config_mount = f"{arch_config_file}:/app/arch_config.yaml:ro"
+    command = ["docker", "run", "--rm"]
+    command += ["-v", config_mount]
+    command += ["--entrypoint", "python"]
+    command += [ARCHGW_DOCKER_IMAGE, "config_generator.py"]
+
+    proc = subprocess.run(command, capture_output=True, text=True, check=False)
+    return proc.returncode, proc.stdout, proc.stderr
--- a/arch/tools/cli/main.py
+++ b/arch/tools/cli/main.py
@ -5,11 +5,12 @@ import subprocess
 import multiprocessing
 import importlib.metadata
 from cli import targets
+from cli.docker_cli import docker_validate_archgw_schema, stream_gateway_logs
 from cli.utils import (
    getLogger,
    get_llm_provider_access_keys,
    load_env_file_to_dict,
-    validate_schema,
+    stream_access_logs,
 )
 from cli.core import (
    start_arch_modelserver,
@ -17,12 +18,9 @@ from cli.core import (
    start_arch,
    stop_arch,
    download_models_from_hf,
-    stream_access_logs,
-    stream_gateway_logs,
 )
 from cli.consts import (
    KATANEMO_DOCKERHUB_REPO,
-    KATANEMO_LOCAL_MODEL_LIST,
    SERVICE_NAME_ARCHGW,
    SERVICE_NAME_MODEL_SERVER,
    SERVICE_ALL,
@ -174,17 +172,24 @@ def up(file, path, service, foreground):

    log.info(f"Validating {arch_config_file}")

-    try:
-        validate_schema(arch_config_file)
-    except Exception as e:
-        log.info(f"Exiting archgw up: validation failed")
-        log.info(f"Error: {str(e)}")
+    (
+        validation_return_code,
+        validation_stdout,
+        validation_stderr,
+    ) = docker_validate_archgw_schema(arch_config_file)
+    if validation_return_code != 0:
+        log.info(f"Error: Validation failed. Exiting")
+        log.info(f"Validation stdout: {validation_stdout}")
+        log.info(f"Validation stderr: {validation_stderr}")
        sys.exit(1)

    log.info("Starting arch model server and arch gateway")

    # Set the ARCH_CONFIG_FILE environment variable
-    env_stage = {}
+    env_stage = {
+        "OTEL_TRACING_HTTP_ENDPOINT": "http://host.docker.internal:4318/v1/traces",
+        "MODEL_SERVER_PORT": os.getenv("MODEL_SERVER_PORT", "51000"),
+    }
    env = os.environ.copy()
    # check if access_keys are preesnt in the config file
    access_keys = get_llm_provider_access_keys(arch_config_file=arch_config_file)
--- a/arch/tools/cli/targets.py
+++ b/arch/tools/cli/targets.py
@ -2,7 +2,6 @@ import ast
 import sys
 import yaml
 from typing import Any
-from pydantic import BaseModel

 FLASK_ROUTE_DECORATORS = ["route", "get", "post", "put", "delete", "patch"]
 FASTAPI_ROUTE_DECORATORS = ["get", "post", "put", "delete", "patch"]
--- a/arch/tools/cli/utils.py
+++ b/arch/tools/cli/utils.py
@ -1,10 +1,11 @@
+import glob
 import os
+import subprocess
+import sys
 import yaml
 import logging
-import docker
-from docker.errors import DockerException

-from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
+from cli.consts import ACCESS_LOG_FILES

 logging.basicConfig(
    level=logging.INFO,
@ -21,63 +22,6 @@ def getLogger(name="cli"):
 log = getLogger(__name__)


-def update_docker_host_env():
-    """
-    Update DOCKER_HOST environment variable to use the local Docker socket
-    """
-    if os.getenv("DOCKER_HOST"):
-        return
-
-    default_docker_socket = os.getenv("DEFAULT_DOCKER_SOCKET", "/var/run/docker.sock")
-    if not os.path.exists(default_docker_socket):
-        home_dir = os.getenv("HOME")
-        docker_host = f"unix://{home_dir}/.docker/run/docker.sock"
-        log.info(
-            f"Default docker socket {default_docker_socket} not found, using {docker_host}"
-        )
-        os.environ["DOCKER_HOST"] = docker_host
-
-
-def validate_schema(arch_config_file: str) -> None:
-    try:
-        try:
-            client = docker.from_env()
-        except DockerException as e:
-            # try setting up the docker host environment variable and retry
-            update_docker_host_env()
-            client = docker.from_env()
-
-        container = client.containers.run(
-            image=ARCHGW_DOCKER_IMAGE,
-            volumes={
-                f"{arch_config_file}": {
-                    "bind": "/app/arch_config.yaml",
-                    "mode": "ro",
-                },
-            },
-            entrypoint=["python", "config_generator.py"],
-            detach=True,
-        )
-
-        # Wait for the container to finish and get the exit code
-        exit_code = container.wait()
-
-        # Check exit code for validation success
-        if exit_code["StatusCode"] != 0:
-            # Validation failed (non-zero exit code)
-            logs = container.logs().decode()  # Get container logs for debugging
-            raise ValueError(
-                f"Validation failed. Container exited with code {exit_code}.\nLogs:\n{logs}"
-            )
-
-        # Successful validation (exit code 0)
-        log.info("Schema validation successful!")
-
-    except docker.errors.APIError as e:
-        # Handle container creation error
-        raise ValueError(f"Failed to create container: {e}")
-
-
 def get_llm_provider_access_keys(arch_config_file):
    with open(arch_config_file, "r") as file:
        arch_config = file.read()
@ -127,3 +71,23 @@ def load_env_file_to_dict(file_path):
                env_dict[key] = value

    return env_dict
+
+
+def stream_access_logs(follow):
+    """
+    Print the archgw access logs by running `tail` on the matching log files.
+
+    The ACCESS_LOG_FILES pattern is expanded (`~` and glob) to a file list.
+    When `follow` is truthy, `-f` is passed to `tail` so output keeps streaming.
+    Raises subprocess.CalledProcessError if `tail` exits with a non-zero code.
+    """
+    log_file_pattern_expanded = os.path.expanduser(ACCESS_LOG_FILES)
+    log_files = glob.glob(log_file_pattern_expanded)
+    if not log_files:
+        # With no file arguments `tail` would read stdin and appear to hang.
+        log.info(f"No access log files found matching {log_file_pattern_expanded}")
+        return
+
+    stream_command = ["tail"]
+    if follow:
+        stream_command.append("-f")
+
+    stream_command.extend(log_files)
+    subprocess.run(
+        stream_command,
+        check=True,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
--- a/arch/tools/poetry.lock
+++ b/arch/tools/poetry.lock
--- a/arch/tools/pyproject.toml
+++ b/arch/tools/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "archgw"
-version = "0.2.1"
+version = "0.2.6"
 description = "Python-based CLI tool to manage Arch Gateway."
 authors = ["Katanemo Labs, Inc."]
 packages = [
@ -10,13 +10,11 @@ readme = "README.md"

 [tool.poetry.dependencies]
 python = "^3.10"
-archgw_modelserver = "^0.2.1"
+archgw_modelserver = "^0.2.6"
 click = "^8.1.7"
 jinja2 = "^3.1.4"
 jsonschema = "^4.23.0"
 setuptools = "75.5.0"
-docker = "^7.1.0"
-python-dotenv = "^1.0.1"
 pyyaml = "^6.0.2"

 [tool.poetry.scripts]
--- a/arch/validate_arch_config.sh
+++ b/arch/validate_arch_config.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Validate every arch_config.yaml / arch_config_full_reference.yaml found under
+# the current directory by running config_generator.py inside the
+# katanemo/archgw:latest image. Exits 1 if any file fails validation.
+
+failed_files=()
+
+# Read NUL-delimited paths so names containing spaces or glob characters are
+# treated as one file instead of being word-split by the shell.
+while IFS= read -r -d '' file; do
+  echo "Validating $file..."
+  # Redirect order matters: stdout is discarded, stderr stays visible.
+  # stdin is /dev/null so docker can never consume the list of paths.
+  if ! docker run --rm -v "$(pwd)/$file:/app/arch_config.yaml:ro" --entrypoint /bin/sh katanemo/archgw:latest -c "python config_generator.py" 2>&1 > /dev/null < /dev/null ; then
+    echo "Validation failed for $file"
+    failed_files+=("$file")
+  fi
+done < <(find . \( -name arch_config.yaml -o -name arch_config_full_reference.yaml \) -print0)
+
+# Print summary of failed files
+if [ ${#failed_files[@]} -ne 0 ]; then
+  echo -e "\nValidation failed for the following files:"
+  printf '%s\n' "${failed_files[@]}"
+  exit 1
+else
+  echo -e "\nAll files validated successfully!"
+fi
--- a/archgw.code-workspace
+++ b/archgw.code-workspace
@ -31,6 +31,10 @@
    {
      "name": "chatbot_ui",
      "path": "demos/shared/chatbot_ui"
+    },
+    {
+      "name": "java_demo",
+      "path": "demos/samples_java/weather_forcecast_service"
    }
  ],
  "settings": {
--- a/crates/common/src/api/open_ai.rs
+++ b/crates/common/src/api/open_ai.rs
@ -135,7 +135,10 @@ impl From<String> for ParameterType {
            "array" => ParameterType::List,
            "dict" => ParameterType::Dict,
            "dictionary" => ParameterType::Dict,
-            _ => ParameterType::String,
+            _ => {
+                log::warn!("Unknown parameter type: {}, assuming type str", s);
+                ParameterType::String
+            }
        }
    }
 }
@ -186,7 +189,7 @@ pub struct ToolCall {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct FunctionCallDetail {
    pub name: String,
-    pub arguments: HashMap<String, Value>,
+    pub arguments: Option<HashMap<String, Value>>,
 }

 #[derive(Debug, Deserialize, Serialize)]
@ -202,13 +205,6 @@ pub struct ToolCallState {
 pub enum ArchState {
    ToolCall(Vec<ToolCallState>),
 }
-#[derive(Deserialize, Serialize)]
-#[serde(untagged)]
-pub enum ModelServerResponse {
-    ChatCompletionsResponse(ChatCompletionsResponse),
-    ModelServerErrorResponse(ModelServerErrorResponse),
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ModelServerErrorResponse {
    pub result: String,
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -25,6 +25,7 @@ pub struct Configuration {
 pub struct Overrides {
    pub prompt_target_intent_matching_threshold: Option<f64>,
    pub optimize_context_window: Option<bool>,
+    pub use_agent_orchestrator: Option<bool>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@ -159,7 +160,7 @@ pub struct LlmProvider {
    pub name: String,
    pub provider_interface: LlmProviderType,
    pub access_key: Option<String>,
-    pub model: String,
+    pub model: Option<String>,
    pub default: Option<bool>,
    pub stream: Option<bool>,
    pub endpoint: Option<String>,
@ -326,16 +327,6 @@ mod test {
            Some("/agent/summary".to_string())
        );

-        let error_target = config.error_target.as_ref().unwrap();
-        assert_eq!(
-            error_target.endpoint.as_ref().unwrap().name,
-            "error_target_1".to_string()
-        );
-        assert_eq!(
-            error_target.endpoint.as_ref().unwrap().path,
-            Some("/error".to_string())
-        );
-
        let tracing = config.tracing.as_ref().unwrap();
        assert_eq!(tracing.sampling_rate.unwrap(), 0.1);

--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -11,10 +11,13 @@ pub const MODEL_SERVER_NAME: &str = "model_server";
 pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
 pub const MESSAGES_KEY: &str = "messages";
 pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint";
-pub const CHAT_COMPLETIONS_PATH: &str = "/v1/chat/completions";
+pub const CHAT_COMPLETIONS_PATH: [&str; 2] = ["/v1/chat/completions", "/openai/v1/chat/completions"];
 pub const HEALTHZ_PATH: &str = "/healthz";
-pub const ARCH_STATE_HEADER: &str = "x-arch-state";
-pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function-1.5B";
+pub const X_ARCH_STATE_HEADER: &str = "x-arch-state";
+pub const X_ARCH_API_RESPONSE: &str = "x-arch-api-response-message";
+pub const X_ARCH_TOOL_CALL: &str = "x-arch-tool-call-message";
+pub const X_ARCH_FC_MODEL_RESPONSE: &str = "x-arch-fc-model-response";
+pub const ARCH_FC_MODEL_NAME: &str = "Arch-Function";
 pub const REQUEST_ID_HEADER: &str = "x-request-id";
 pub const TRACE_PARENT_HEADER: &str = "traceparent";
 pub const ARCH_INTERNAL_CLUSTER_NAME: &str = "arch_internal";
--- a/crates/common/src/http.rs
+++ b/crates/common/src/http.rs
@ -3,7 +3,7 @@ use crate::{
    stats::{Gauge, IncrementingMetric},
 };
 use derivative::Derivative;
-use log::trace;
+use log::debug;
 use proxy_wasm::traits::Context;
 use serde::Serialize;
 use std::{cell::RefCell, collections::HashMap, fmt::Debug, time::Duration};
@ -48,10 +48,9 @@ pub trait Client: Context {
        call_args: CallArgs,
        call_context: Self::CallContext,
    ) -> Result<u32, ClientError> {
-        trace!(
+        debug!(
            "dispatching http call with args={:?} context={:?}",
-            call_args,
-            call_context
+            call_args, call_context
        );

        match self.dispatch_http_call(
--- a/crates/common/src/tokenizer.rs
+++ b/crates/common/src/tokenizer.rs
@ -1,19 +1,25 @@
-use log::trace;
-
-#[derive(thiserror::Error, Debug, PartialEq, Eq)]
-#[allow(dead_code)]
-pub enum Error {
-    #[error("Unknown model: {model_name}")]
-    UnknownModel { model_name: String },
-}
+use log::debug;

 #[allow(dead_code)]
-pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
-    trace!("getting token count model={}", model_name);
+pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
+    debug!("getting token count model={}", model_name);
+    //HACK: add support for tokenizing mistral and other models
+    //filed issue https://github.com/katanemo/arch/issues/222
+
+    let updated_model = match model_name.starts_with("gpt") {
+        false => {
+            debug!(
+                "tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
+                model_name
+            );
+
+            "gpt-4"
+        }
+        true => model_name,
+    };
+
    // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
-    let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel {
-        model_name: model_name.to_string(),
-    })?;
+    let bpe = tiktoken_rs::get_bpe_from_model(updated_model).map_err(|e| e.to_string())?;
    Ok(bpe.encode_ordinary(text).len())
 }

@ -30,14 +36,4 @@ mod test {
            token_count(model_name, text).expect("correct tokenization")
        );
    }
-
-    #[test]
-    fn unrecognized_model() {
-        assert_eq!(
-            Error::UnknownModel {
-                model_name: "unknown".to_string()
-            },
-            token_count("unknown", "").expect_err("unknown model")
-        )
-    }
 }
--- a/crates/common/src/tracing.rs
+++ b/crates/common/src/tracing.rs
@ -166,7 +166,7 @@ impl TraceData {
                attributes: vec![Attribute {
                    key: "service.name".to_string(),
                    value: AttributeValue {
-                        string_value: Some("upstream-llm".to_string()),
+                        string_value: Some("egress_llm_traffic".to_string()),
                    },
                }],
            };
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -1,6 +1,7 @@
 use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
 use common::configuration::Configuration;
+use common::configuration::Overrides;
 use common::consts::OTEL_COLLECTOR_HTTP;
 use common::consts::OTEL_POST_PATH;
 use common::http::CallArgs;
@ -31,6 +32,7 @@ pub struct FilterContext {
    callouts: RefCell<HashMap<u32, CallContext>>,
    llm_providers: Option<Rc<LlmProviders>>,
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+    overrides: Rc<Option<Overrides>>,
 }

 impl FilterContext {
@ -40,6 +42,7 @@ impl FilterContext {
            metrics: Rc::new(Metrics::new()),
            llm_providers: None,
            traces_queue: Arc::new(Mutex::new(VecDeque::new())),
+            overrides: Rc::new(None),
        }
    }
 }
@ -69,6 +72,7 @@ impl RootContext for FilterContext {
        };

        ratelimit::ratelimits(Some(config.ratelimits.unwrap_or_default()));
+        self.overrides = Rc::new(config.overrides);

        match config.llm_providers.try_into() {
            Ok(llm_providers) => self.llm_providers = Some(Rc::new(llm_providers)),
@ -93,6 +97,7 @@ impl RootContext for FilterContext {
                    .expect("LLM Providers must exist when Streams are being created"),
            ),
            Arc::clone(&self.traces_queue),
+            Rc::clone(&self.overrides),
        )))
    }

--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -3,9 +3,9 @@ use common::api::open_ai::{
    ChatCompletionStreamResponseServerEvents, ChatCompletionsRequest, ChatCompletionsResponse,
    Message, StreamOptions,
 };
-use common::configuration::LlmProvider;
+use common::configuration::{LlmProvider, LlmProviderType, Overrides};
 use common::consts::{
-    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH,
+    ARCH_PROVIDER_HINT_HEADER, ARCH_ROUTING_HEADER, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
    RATELIMIT_SELECTOR_HEADER_KEY, REQUEST_ID_HEADER, TRACE_PARENT_HEADER,
 };
 use common::errors::ServerError;
@ -15,7 +15,7 @@ use common::stats::{IncrementingMetric, RecordingMetric};
 use common::tracing::{Event, Span, TraceData, Traceparent};
 use common::{ratelimit, routing, tokenizer};
 use http::StatusCode;
-use log::{debug, trace, warn};
+use log::{debug, info, warn};
 use proxy_wasm::hostcalls::get_current_time;
 use proxy_wasm::traits::*;
 use proxy_wasm::types::*;
@ -42,6 +42,7 @@ pub struct StreamContext {
    request_body_sent_time: Option<u128>,
    user_message: Option<Message>,
    traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+    overrides: Rc<Option<Overrides>>,
 }

 impl StreamContext {
@ -50,10 +51,12 @@ impl StreamContext {
        metrics: Rc<Metrics>,
        llm_providers: Rc<LlmProviders>,
        traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
+        overrides: Rc<Option<Overrides>>,
    ) -> Self {
        StreamContext {
            context_id,
            metrics,
+            overrides,
            ratelimit_selector: None,
            streaming_response: false,
            response_tokens: 0,
@ -86,10 +89,34 @@ impl StreamContext {
            provider_hint,
        ));

+        // Check if we need to modify the path based on the provider's base_url
+        let needs_openai_prefix = self
+            .llm_provider
+            .as_ref()
+            .and_then(|provider| provider.endpoint.as_ref())
+            .map(|url| url.contains("api.groq.com"))
+            .unwrap_or(false);
+
+        if needs_openai_prefix {
+            if let Some(path) = self.get_http_request_header(":path") {
+                if path.starts_with("/v1/") {
+                    let new_path = format!("/openai{}", path);
+                    self.set_http_request_header(":path", Some(new_path.as_str()));
+                }
+            }
+        }
+
        debug!(
-            "request received: llm provider hint: {:?}, selected llm: {}",
-            self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER),
-            self.llm_provider.as_ref().unwrap().name
+            "request received: llm provider hint: {}, selected llm: {}, model: {}",
+            self.get_http_request_header(ARCH_PROVIDER_HINT_HEADER)
+                .unwrap_or_default(),
+            self.llm_provider.as_ref().unwrap().name,
+            self.llm_provider
+                .as_ref()
+                .unwrap()
+                .model
+                .as_ref()
+                .unwrap_or(&String::new())
        );
    }

@ -130,7 +157,7 @@ impl StreamContext {
    }

    fn send_server_error(&self, error: ServerError, override_status_code: Option<StatusCode>) {
-        debug!("server error occurred: {}", error);
+        warn!("server error occurred: {}", error);
        self.send_http_response(
            override_status_code
                .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR)
@ -149,11 +176,11 @@ impl StreamContext {
        // Tokenize and record token count.
        let token_count = tokenizer::token_count(model, json_string).unwrap_or(0);

+        debug!("Recorded input token count: {}", token_count);
        // Record the token count to metrics.
        self.metrics
            .input_sequence_length
            .record(token_count as u64);
-        trace!("Recorded input token count: {}", token_count);

        // Check if rate limiting needs to be applied.
        if let Some(selector) = self.ratelimit_selector.take() {
@ -164,7 +191,7 @@ impl StreamContext {
                NonZero::new(token_count as u32).unwrap(),
            )?;
        } else {
-            trace!("No rate limit applied for model: {}", model);
+            debug!("No rate limit applied for model: {}", model);
        }

        Ok(())
@ -176,29 +203,59 @@ impl HttpContext for StreamContext {
    // Envoy's HTTP model is event driven. The WASM ABI has given implementors events to hook onto
    // the lifecycle of the http request and response.
    fn on_http_request_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
-        self.select_llm_provider();
-
-        // if endpoint is not set then use provider name as routing header so envoy can resolve the cluster name
-        if self.llm_provider().endpoint.is_none() {
-            self.add_http_request_header(
-                ARCH_ROUTING_HEADER,
-                &self.llm_provider().provider_interface.to_string(),
-            );
-        } else {
-            self.add_http_request_header(ARCH_ROUTING_HEADER, &self.llm_provider().name);
+        let request_path = self.get_http_request_header(":path").unwrap_or_default();
+        if request_path == HEALTHZ_PATH {
+            self.send_http_response(200, vec![], None);
+            return Action::Continue;
        }

-        if let Err(error) = self.modify_auth_headers() {
-            // ensure that the provider has an endpoint if the access key is missing else return a bad request
-            if self.llm_provider.as_ref().unwrap().endpoint.is_none() {
-                self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
+        let routing_header_value = self.get_http_request_header(ARCH_ROUTING_HEADER);
+
+        let use_agent_orchestrator = match self.overrides.as_ref() {
+            Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
+            None => false,
+        };
+
+        if let Some(routing_header_value) = routing_header_value.as_ref() {
+            info!("routing header already set: {}", routing_header_value);
+            self.llm_provider = Some(Rc::new(LlmProvider {
+                name: routing_header_value.to_string(),
+                provider_interface: LlmProviderType::OpenAI,
+                access_key: None,
+                endpoint: None,
+                model: None,
+                default: None,
+                stream: None,
+                port: None,
+                rate_limits: None,
+            }));
+        } else {
+            self.select_llm_provider();
+            if self.llm_provider().endpoint.is_some() {
+                self.add_http_request_header(
+                    ARCH_ROUTING_HEADER,
+                    &self.llm_provider().name.to_string(),
+                );
+            } else {
+                self.add_http_request_header(
+                    ARCH_ROUTING_HEADER,
+                    &self.llm_provider().provider_interface.to_string(),
+                );
+            }
+            if let Err(error) = self.modify_auth_headers() {
+                // ensure that the provider has an endpoint if the access key is missing else return a bad request
+                if self.llm_provider.as_ref().unwrap().endpoint.is_none() && !use_agent_orchestrator
+                {
+                    self.send_server_error(error, Some(StatusCode::BAD_REQUEST));
+                }
            }
        }
+
        self.delete_content_length_header();
        self.save_ratelimit_header();

-        self.is_chat_completions_request =
-            self.get_http_request_header(":path").unwrap_or_default() == CHAT_COMPLETIONS_PATH;
+        let request_path = self.get_http_request_header(":path").unwrap_or_default();
+        self.is_chat_completions_request = CHAT_COMPLETIONS_PATH.contains(&request_path.as_str());

        self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
        self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
@ -207,6 +264,11 @@ impl HttpContext for StreamContext {
    }

    fn on_http_request_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
+        debug!(
+            "on_http_request_body [S={}] bytes={} end_stream={}",
+            self.context_id, body_size, end_of_stream
+        );
+
        // Let the client send the gateway all the data before sending to the LLM_provider.
        // TODO: consider a streaming API.

@ -222,34 +284,41 @@ impl HttpContext for StreamContext {
            return Action::Continue;
        }

+        let body_bytes = match self.get_http_request_body(0, body_size) {
+            Some(body_bytes) => body_bytes,
+            None => {
+                self.send_server_error(
+                    ServerError::LogicError(format!(
+                        "Failed to obtain body bytes even though body_size is {}",
+                        body_size
+                    )),
+                    None,
+                );
+                return Action::Pause;
+            }
+        };
+
        // Deserialize body into spec.
        // Currently OpenAI API.
        let mut deserialized_body: ChatCompletionsRequest =
-            match self.get_http_request_body(0, body_size) {
-                Some(body_bytes) => match serde_json::from_slice(&body_bytes) {
-                    Ok(deserialized) => deserialized,
-                    Err(e) => {
-                        self.send_server_error(
-                            ServerError::Deserialization(e),
-                            Some(StatusCode::BAD_REQUEST),
-                        );
-                        return Action::Pause;
-                    }
-                },
-                None => {
+            match serde_json::from_slice(&body_bytes) {
+                Ok(deserialized) => deserialized,
+                Err(e) => {
+                    debug!(
+                        "on_http_request_body: request body: {}",
+                        String::from_utf8_lossy(&body_bytes)
+                    );
                    self.send_server_error(
-                        ServerError::LogicError(format!(
-                            "Failed to obtain body bytes even though body_size is {}",
-                            body_size
-                        )),
-                        None,
+                        ServerError::Deserialization(e),
+                        Some(StatusCode::BAD_REQUEST),
                    );
                    return Action::Pause;
                }
            };

        // remove metadata from the request body
-        deserialized_body.metadata = None;
+        //TODO: move this to prompt gateway
+        // deserialized_body.metadata = None;
        // delete model key from message array
        for message in deserialized_body.messages.iter_mut() {
            message.model = None;
@ -262,15 +331,47 @@ impl HttpContext for StreamContext {
            .last()
            .cloned();

-        // override model name from the llm provider
-        deserialized_body
-            .model
-            .clone_from(&self.llm_provider.as_ref().unwrap().model);
+        let model_name = match self.llm_provider.as_ref() {
+            Some(llm_provider) => llm_provider.model.as_ref(),
+            None => None,
+        };
+
+        let use_agent_orchestrator = match self.overrides.as_ref() {
+            Some(overrides) => overrides.use_agent_orchestrator.unwrap_or_default(),
+            None => false,
+        };
+
+        let model_requested = deserialized_body.model.clone();
+        if deserialized_body.model.is_empty() || deserialized_body.model.to_lowercase() == "none" {
+            deserialized_body.model = match model_name {
+                Some(model_name) => model_name.clone(),
+                None => {
+                    if use_agent_orchestrator {
+                        "agent_orchestrator".to_string()
+                    } else {
+                        self.send_server_error(
+                          ServerError::BadRequest {
+                              why: format!("No model specified in request and couldn't determine model name from arch_config. Model name in req: {}, arch_config, provider: {}, model: {:?}", deserialized_body.model, self.llm_provider().name, self.llm_provider().model).to_string(),
+                          },
+                          Some(StatusCode::BAD_REQUEST),
+                      );
+                        return Action::Continue;
+                    }
+                }
+            }
+        }
+
+        info!(
+            "on_http_request_body: provider: {}, model requested: {}, model selected: {}",
+            self.llm_provider().name,
+            model_requested,
+            model_name.unwrap_or(&"None".to_string()),
+        );
+
        let chat_completion_request_str = serde_json::to_string(&deserialized_body).unwrap();

-        trace!(
-            "arch => {:?}, body: {}",
-            deserialized_body.model,
+        debug!(
+            "on_http_request_body: request body: {}",
            chat_completion_request_str
        );

@ -307,10 +408,9 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
-        trace!(
+        debug!(
            "on_http_response_headers [S={}] end_stream={}",
-            self.context_id,
-            _end_of_stream
+            self.context_id, _end_of_stream
        );

        self.set_property(
@ -322,15 +422,18 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
-        trace!(
+        debug!(
            "on_http_response_body [S={}] bytes={} end_stream={}",
-            self.context_id,
-            body_size,
-            end_of_stream
+            self.context_id, body_size, end_of_stream
        );

+        if self.request_body_sent_time.is_none() {
+            debug!("on_http_response_body: request body not sent, not doing any processing in llm filter");
+            return Action::Continue;
+        }
+
        if !self.is_chat_completions_request {
-            debug!("non-chatcompletion request");
+            info!("on_http_response_body: non-chatcompletion request");
            return Action::Continue;
        }

@ -342,7 +445,7 @@ impl HttpContext for StreamContext {
                Ok(duration) => {
                    // Convert the duration to milliseconds
                    let duration_ms = duration.as_millis();
-                    debug!("request latency: {}ms", duration_ms);
+                    info!("on_http_response_body: request latency: {}ms", duration_ms);
                    // Record the latency to the latency histogram
                    self.metrics.request_latency.record(duration_ms as u64);

@ -353,7 +456,7 @@ impl HttpContext for StreamContext {
                        // Record the time per output token
                        self.metrics.time_per_output_token.record(tpot);

-                        trace!(
+                        debug!(
                            "time per token: {}ms, tokens per second: {}",
                            tpot,
                            1000 / tpot
@ -381,7 +484,7 @@ impl HttpContext for StreamContext {
                    Ok(traceparent) => {
                        let mut trace_data = common::tracing::TraceData::new();
                        let mut llm_span = Span::new(
-                            "upstream_llm_time".to_string(),
+                            "egress_traffic".to_string(),
                            Some(traceparent.trace_id),
                            Some(traceparent.parent_id),
                            self.request_body_sent_time.unwrap(),
@ -417,10 +520,9 @@ impl HttpContext for StreamContext {
        let body = if self.streaming_response {
            let chunk_start = 0;
            let chunk_size = body_size;
-            trace!(
-                "streaming response reading, {}..{}",
-                chunk_start,
-                chunk_size
+            debug!(
+                "on_http_response_body: streaming response reading, {}..{}",
+                chunk_start, chunk_size
            );
            let streaming_chunk = match self.get_http_response_body(0, chunk_size) {
                Some(chunk) => chunk,
@ -442,7 +544,7 @@ impl HttpContext for StreamContext {
            }
            streaming_chunk
        } else {
-            trace!("non streaming response bytes read: 0:{}", body_size);
+            debug!("non streaming response bytes read: 0:{}", body_size);
            match self.get_http_response_body(0, body_size) {
                Some(body) => body,
                None => {
@ -455,17 +557,21 @@ impl HttpContext for StreamContext {
        let body_utf8 = match String::from_utf8(body) {
            Ok(body_utf8) => body_utf8,
            Err(e) => {
-                debug!("could not convert to utf8: {}", e);
+                warn!("could not convert to utf8: {}", e);
                return Action::Continue;
            }
        };

        if self.streaming_response {
+            if body_utf8 == "data: [DONE]\n" {
+                return Action::Continue;
+            }
+
            let chat_completions_chunk_response_events =
                match ChatCompletionStreamResponseServerEvents::try_from(body_utf8.as_str()) {
                    Ok(response) => response,
                    Err(e) => {
-                        debug!(
+                        warn!(
                            "invalid streaming response: body str: {}, {:?}",
                            body_utf8, e
                        );
@ -474,33 +580,27 @@ impl HttpContext for StreamContext {
                };

            if chat_completions_chunk_response_events.events.is_empty() {
-                debug!("empty streaming response");
+                warn!(
+                    "couldn't parse any streaming events: body str: {}",
+                    body_utf8
+                );
                return Action::Continue;
            }

-            let mut model = chat_completions_chunk_response_events
+            let model = chat_completions_chunk_response_events
                .events
                .first()
                .unwrap()
                .model
                .clone();
            let tokens_str = chat_completions_chunk_response_events.to_string();
-            //HACK: add support for tokenizing mistral and other models
-            //filed issue https://github.com/katanemo/arch/issues/222
-            if !model.as_ref().unwrap().starts_with("gpt") {
-                warn!(
-                    "tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
-                    model.as_ref().unwrap()
-                );
-            }
-            model = Some("gpt-4".to_string());

            let token_count =
                match tokenizer::token_count(model.as_ref().unwrap().as_str(), tokens_str.as_str())
                {
                    Ok(token_count) => token_count,
                    Err(e) => {
-                        debug!("could not get token count: {:?}", e);
+                        warn!("could not get token count: {:?}", e);
                        return Action::Continue;
                    }
                };
@ -514,7 +614,10 @@ impl HttpContext for StreamContext {
                match current_time.duration_since(self.start_time) {
                    Ok(duration) => {
                        let duration_ms = duration.as_millis();
-                        debug!("time to first token: {}ms", duration_ms);
+                        info!(
+                            "on_http_response_body: time to first token: {}ms",
+                            duration_ms
+                        );
                        self.ttft_duration = Some(duration);
                        self.metrics.time_to_first_token.record(duration_ms as u64);
                    }
@ -524,12 +627,12 @@ impl HttpContext for StreamContext {
                }
            }
        } else {
-            trace!("non streaming response");
+            debug!("non streaming response");
            let chat_completions_response: ChatCompletionsResponse =
                match serde_json::from_str(body_utf8.as_str()) {
                    Ok(de) => de,
                    Err(err) => {
-                        debug!(
+                        info!(
                            "non chat-completion compliant response received err: {}, body: {}",
                            err, body_utf8
                        );
@ -546,11 +649,9 @@ impl HttpContext for StreamContext {
            }
        }

-        trace!(
+        debug!(
            "recv [S={}] total_tokens={} end_stream={}",
-            self.context_id,
-            self.response_tokens,
-            end_of_stream
+            self.context_id, self.response_tokens, end_of_stream
        );

        Action::Continue
--- a/crates/llm_gateway/tests/integration.rs
+++ b/crates/llm_gateway/tests/integration.rs
@ -18,12 +18,19 @@ fn wasm_module() -> String {
 fn request_headers_expectations(module: &mut Tester, http_context: i32) {
    module
        .call_proxy_on_request_headers(http_context, 0, false)
+        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
+        .returning(Some("/v1/chat/completions"))
+        .expect_get_header_map_value(
+            Some(MapType::HttpRequestHeaders),
+            Some("x-arch-llm-provider"),
+        )
+        .returning(None)
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider-hint"),
        )
        .returning(None)
-        .expect_log(Some(LogLevel::Debug), Some("request received: llm provider hint: Some(\"default\"), selected llm: open-ai-gpt-4"))
+        .expect_log(Some(LogLevel::Debug), Some("request received: llm provider hint: default, selected llm: open-ai-gpt-4, model: gpt-4"))
        .expect_add_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider"),
@ -34,6 +41,7 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
            Some("Authorization"),
            Some("Bearer secret_key"),
        )
+        .expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
        .expect_get_header_map_value(
            Some(MapType::HttpRequestHeaders),
            Some("x-arch-llm-provider-hint"),
@ -46,8 +54,6 @@ fn request_headers_expectations(module: &mut Tester, http_context: i32) {
        .returning(Some("selector-key"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("selector-key"))
        .returning(Some("selector-value"))
-        .expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
-        .returning(None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
        .returning(Some("/v1/chat/completions"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
@ -217,12 +223,14 @@ fn llm_gateway_successful_request_to_open_ai_chat_completions() {
            chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_record("input_sequence_length", 21)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
@ -264,7 +272,7 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
    {\
        \"messages\": [\
        {\
-            \"role\": \"system\",\
+            \"role\": \"system\"\
        },\
        {\
            \"role\": \"user\",\
@ -282,13 +290,20 @@ fn llm_gateway_bad_request_to_open_ai_chat_completions() {
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(incomplete_chat_completions_request_body))
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"))
        .expect_send_local_response(
            Some(StatusCode::BAD_REQUEST.as_u16().into()),
            None,
            None,
            None,
        )
-        .execute_and_expect(ReturnType::Action(Action::Pause))
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_metric_record("input_sequence_length", 14)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .execute_and_expect(ReturnType::Action(Action::Continue))
        .unwrap();
 }

@ -337,16 +352,18 @@ fn llm_gateway_request_ratelimited() {
            chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
        // The actual call is not important in this test, we just need to grab the token_id
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_record("input_sequence_length", 107)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Warn), Some("server error occurred: exceeded limit provider=gpt-4, selector=Header { key: \"selector-key\", value: \"selector-value\" }, tokens_used=107"))
        .expect_send_local_response(
            Some(StatusCode::TOO_MANY_REQUESTS.as_u16().into()),
            None,
@ -403,13 +420,201 @@ fn llm_gateway_request_not_ratelimited() {
            chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
        // The actual call is not important in this test, we just need to grab the token_id
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_metric_record("input_sequence_length", 29)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
+        .execute_and_expect(ReturnType::Action(Action::Continue))
+        .unwrap();
+}
+
+#[test]
+#[serial]
+fn llm_gateway_override_model_name() { // Request body names its own model ("o1-mini"); pins the logged provider/model names and a Continue action.
+    let args = tester::MockSettings {
+        wasm_path: wasm_module(),
+        quiet: false,
+        allow_unexpected: false,
+    };
+    let mut module = tester::mock(args).unwrap();
+
+    module
+        .call_start()
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+
+    // Set up the filter context using default_config().
+    let filter_context = setup_filter(&mut module, default_config());
+
+    // Context id used for the HTTP stream under test.
+    let http_context = 2;
+
+    normal_flow(&mut module, filter_context, http_context);
+
+    // Short body (kept small to avoid rate limiting) that explicitly requests model "o1-mini".
+    let chat_completions_request_body = "\
+{\
+    \"model\": \"o1-mini\",\
+    \"messages\": [\
+    {\
+        \"role\": \"system\",\
+        \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
+    },\
+    {\
+        \"role\": \"user\",\
+        \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
+    }\
+    ]
+}";
+
+    module
+        .call_proxy_on_request_body(
+            http_context,
+            chat_completions_request_body.len() as i32,
+            true,
+        )
+        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
+        .returning(Some(chat_completions_request_body))
+        // Expected host calls: the info log pins provider "open-ai-gpt-4", requested model "o1-mini" and selected model "gpt-4".
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: o1-mini, model selected: gpt-4"))
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_metric_record("input_sequence_length", 29)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
+        .execute_and_expect(ReturnType::Action(Action::Continue))
+        .unwrap();
+}
+
+#[test]
+#[serial]
+fn llm_gateway_override_use_default_model() { // Request body omits "model"; pins the logged provider/model names and a Continue action.
+    let args = tester::MockSettings {
+        wasm_path: wasm_module(),
+        quiet: false,
+        allow_unexpected: false,
+    };
+    let mut module = tester::mock(args).unwrap();
+
+    module
+        .call_start()
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+
+    // Set up the filter context using default_config().
+    let filter_context = setup_filter(&mut module, default_config());
+
+    // Context id used for the HTTP stream under test.
+    let http_context = 2;
+
+    normal_flow(&mut module, filter_context, http_context);
+
+    // Short body (kept small to avoid rate limiting) with no "model" field at all.
+    let chat_completions_request_body = "\
+{\
+    \"messages\": [\
+    {\
+        \"role\": \"system\",\
+        \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
+    },\
+    {\
+        \"role\": \"user\",\
+        \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
+    }\
+    ]
+}";
+
+    module
+        .call_proxy_on_request_body(
+            http_context,
+            chat_completions_request_body.len() as i32,
+            true,
+        )
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
+        .returning(Some(chat_completions_request_body))
+        // Expected host calls: the info log pins an empty requested model and selected model "gpt-4" for provider "open-ai-gpt-4".
+        .expect_log(
+            Some(LogLevel::Info),
+            Some("on_http_request_body: provider: open-ai-gpt-4, model requested: , model selected: gpt-4"),
+        )
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_metric_record("input_sequence_length", 29)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
+        .execute_and_expect(ReturnType::Action(Action::Continue))
+        .unwrap();
+}
+
+#[test]
+#[serial]
+fn llm_gateway_override_use_model_name_none() {
+    let args = tester::MockSettings {
+        wasm_path: wasm_module(),
+        quiet: false,
+        allow_unexpected: false,
+    };
+    let mut module = tester::mock(args).unwrap();
+
+    module
+        .call_start()
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+
+    // Setup Filter
+    let filter_context = setup_filter(&mut module, default_config());
+
+    // Setup HTTP Stream
+    let http_context = 2;
+
+    normal_flow(&mut module, filter_context, http_context);
+
+    // give shorter body to avoid rate limiting
+    let chat_completions_request_body = "\
+{\
+    \"model\": \"none\",\
+    \"messages\": [\
+    {\
+        \"role\": \"system\",\
+        \"content\": \"You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.\"\
+    },\
+    {\
+        \"role\": \"user\",\
+        \"content\": \"Compose a poem that explains the concept of recursion in programming.\"\
+    }\
+    ]
+}";
+
+    module
+        .call_proxy_on_request_body(
+            http_context,
+            chat_completions_request_body.len() as i32,
+            true,
+        )
+        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
+        .returning(Some(chat_completions_request_body))
+        // The actual call is not important in this test, we just need to grab the token_id
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), Some("on_http_request_body: provider: open-ai-gpt-4, model requested: none, model selected: gpt-4"))
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_metric_record("input_sequence_length", 29)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_log(Some(LogLevel::Debug), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
--- a/crates/prompt_gateway/src/filter_context.rs
+++ b/crates/prompt_gateway/src/filter_context.rs
@ -1,6 +1,8 @@
 use crate::metrics::Metrics;
 use crate::stream_context::StreamContext;
-use common::configuration::{Configuration, Overrides, PromptGuards, PromptTarget, Tracing};
+use common::configuration::{
+    Configuration, Endpoint, Overrides, PromptGuards, PromptTarget, Tracing,
+};
 use common::http::Client;
 use common::stats::Gauge;
 use log::trace;
@ -21,6 +23,7 @@ pub struct FilterContext {
    overrides: Rc<Option<Overrides>>,
    system_prompt: Rc<Option<String>>,
    prompt_targets: Rc<HashMap<String, PromptTarget>>,
+    endpoints: Rc<Option<HashMap<String, Endpoint>>>,
    prompt_guards: Rc<PromptGuards>,
    tracing: Rc<Option<Tracing>>,
 }
@ -34,6 +37,7 @@ impl FilterContext {
            prompt_targets: Rc::new(HashMap::new()),
            overrides: Rc::new(None),
            prompt_guards: Rc::new(PromptGuards::default()),
+            endpoints: Rc::new(None),
            tracing: Rc::new(None),
        }
    }
@ -73,6 +77,7 @@ impl RootContext for FilterContext {
        }
        self.system_prompt = Rc::new(config.system_prompt);
        self.prompt_targets = Rc::new(prompt_targets);
+        self.endpoints = Rc::new(config.endpoints);

        if let Some(prompt_guards) = config.prompt_guards {
            self.prompt_guards = Rc::new(prompt_guards)
@ -94,6 +99,7 @@ impl RootContext for FilterContext {
            Rc::clone(&self.metrics),
            Rc::clone(&self.system_prompt),
            Rc::clone(&self.prompt_targets),
+            Rc::clone(&self.endpoints),
            Rc::clone(&self.overrides),
            Rc::clone(&self.tracing),
        )))
--- a/crates/prompt_gateway/src/http_context.rs
+++ b/crates/prompt_gateway/src/http_context.rs
@ -4,17 +4,18 @@ use common::{
        self, ArchState, ChatCompletionStreamResponse, ChatCompletionTool, ChatCompletionsRequest,
    },
    consts::{
-        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
+        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_ROUTING_HEADER,
        ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
        MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE,
-        TRACE_PARENT_HEADER, USER_ROLE,
+        TRACE_PARENT_HEADER, USER_ROLE, X_ARCH_API_RESPONSE, X_ARCH_FC_MODEL_RESPONSE,
+        X_ARCH_STATE_HEADER, X_ARCH_TOOL_CALL,
    },
    errors::ServerError,
    http::{CallArgs, Client},
    pii::obfuscate_auth_header,
 };
 use http::StatusCode;
-use log::{debug, trace, warn};
+use log::{debug, info, warn};
 use proxy_wasm::{traits::HttpContext, types::Action};
 use serde_json::Value;
 use std::{
@ -33,15 +34,37 @@ impl HttpContext for StreamContext {
        // manipulate the body in benign ways e.g., compression.
        self.set_http_request_header("content-length", None);

+        // When the use_agent_orchestrator override is on, the name of the single
+        // configured endpoint is written to the ARCH_ROUTING_HEADER request header.
+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                if let Some(endpoints) = self.endpoints.as_ref() {
+                    match endpoints.keys().next() {
+                        // Exactly one endpoint configured: use its name as the route.
+                        Some(name) if endpoints.len() == 1 => {
+                            // Log the header actually being set rather than a
+                            // hard-coded (and different) constant name.
+                            info!("Setting {} to {}", ARCH_ROUTING_HEADER, name);
+                            self.set_http_request_header(ARCH_ROUTING_HEADER, Some(name));
+                        }
+                        // Zero or several endpoints: there is no unambiguous route.
+                        _ => {
+                            let reason = "Need single endpoint when use_agent_orchestrator is set";
+                            warn!("{}", reason);
+                            self.send_server_error(
+                                ServerError::LogicError(reason.to_string()),
+                                None,
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
        let request_path = self.get_http_request_header(":path").unwrap_or_default();
        if request_path == HEALTHZ_PATH {
            self.send_http_response(200, vec![], None);
            return Action::Continue;
        }

-        self.is_chat_completions_request = request_path == CHAT_COMPLETIONS_PATH;
+        self.is_chat_completions_request = CHAT_COMPLETIONS_PATH.contains(&request_path.as_str());

-        trace!(
+        debug!(
            "on_http_request_headers S[{}] req_headers={:?}",
            self.context_id,
            obfuscate_auth_header(&mut self.get_http_request_headers())
@ -49,6 +72,7 @@ impl HttpContext for StreamContext {

        self.request_id = self.get_http_request_header(REQUEST_ID_HEADER);
        self.traceparent = self.get_http_request_header(TRACE_PARENT_HEADER);
+
        Action::Continue
    }

@ -66,10 +90,9 @@ impl HttpContext for StreamContext {

        self.request_body_size = body_size;

-        trace!(
+        debug!(
            "on_http_request_body S[{}] body_size={}",
-            self.context_id,
-            body_size
+            self.context_id, body_size
        );

        let body_bytes = match self.get_http_request_body(0, body_size) {
@ -86,7 +109,7 @@ impl HttpContext for StreamContext {
            }
        };

-        trace!("request body: {}", String::from_utf8_lossy(&body_bytes));
+        debug!("request body: {}", String::from_utf8_lossy(&body_bytes));

        // Deserialize body into spec.
        // Currently OpenAI API.
@ -103,8 +126,8 @@ impl HttpContext for StreamContext {

        self.arch_state = match deserialized_body.metadata {
            Some(ref metadata) => {
-                if metadata.contains_key(ARCH_STATE_HEADER) {
-                    let arch_state_str = metadata[ARCH_STATE_HEADER].clone();
+                if metadata.contains_key(X_ARCH_STATE_HEADER) {
+                    let arch_state_str = metadata[X_ARCH_STATE_HEADER].clone();
                    let arch_state: Vec<ArchState> = serde_json::from_str(&arch_state_str).unwrap();
                    Some(arch_state)
                } else {
@ -152,11 +175,23 @@ impl HttpContext for StreamContext {
            }
        }

+        // Propagate the use_agent_orchestrator override through the metadata of the
+        // request built below, creating the metadata map if none exists yet.
+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                metadata
+                    .get_or_insert_with(HashMap::new)
+                    .insert("use_agent_orchestrator".to_string(), "true".to_string());
+            }
+        }
+
        let arch_fc_chat_completion_request = ChatCompletionsRequest {
            messages: deserialized_body.messages.clone(),
            metadata,
            stream: deserialized_body.stream,
-            model: "--".to_string(),
+            model: deserialized_body.model.clone(),
            stream_options: deserialized_body.stream_options.clone(),
            tools: Some(tool_calls),
        };
@ -171,8 +206,10 @@ impl HttpContext for StreamContext {
            }
        };

-        debug!("sending request to model server");
-        trace!("request body: {}", json_data);
+        info!("on_http_request_body: sending request to model server");
+        debug!("request body: {}", json_data);

        let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string();

@ -213,7 +250,7 @@ impl HttpContext for StreamContext {
        };

        if let Err(e) = self.http_call(call_args, call_context) {
-            debug!("http_call failed: {:?}", e);
+            warn!("http_call failed: {:?}", e);
            self.send_server_error(ServerError::HttpDispatch(e), None);
        }

@ -221,7 +258,7 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_headers(&mut self, _num_headers: usize, _end_of_stream: bool) -> Action {
-        trace!(
+        debug!(
            "on_http_response_headers recv [S={}] headers={:?}",
            self.context_id,
            self.get_http_response_headers()
@ -233,15 +270,13 @@ impl HttpContext for StreamContext {
    }

    fn on_http_response_body(&mut self, body_size: usize, end_of_stream: bool) -> Action {
-        trace!(
+        debug!(
            "on_http_response_body: recv [S={}] bytes={} end_stream={}",
-            self.context_id,
-            body_size,
-            end_of_stream
+            self.context_id, body_size, end_of_stream
        );

        if !self.is_chat_completions_request {
-            debug!("non-gpt request");
+            info!("non-gpt request");
            return Action::Continue;
        }

@ -280,7 +315,7 @@ impl HttpContext for StreamContext {

            streaming_chunk
        } else {
-            debug!("non streaming response bytes read: 0:{}", body_size);
+            info!("non streaming response bytes read: 0:{}", body_size);
            match self.get_http_response_body(0, body_size) {
                Some(body) => body,
                None => {
@ -293,21 +328,21 @@ impl HttpContext for StreamContext {
        let body_utf8 = match String::from_utf8(body) {
            Ok(body_utf8) => body_utf8,
            Err(e) => {
-                debug!("could not convert to utf8: {}", e);
+                info!("could not convert to utf8: {}", e);
                return Action::Continue;
            }
        };

        if self.streaming_response {
-            trace!("streaming response");
+            debug!("streaming response");

            if self.tool_calls.is_some() && !self.tool_calls.as_ref().unwrap().is_empty() {
                let chunks = vec![
                    ChatCompletionStreamResponse::new(
-                        None,
+                        self.arch_fc_response.clone(),
                        Some(ASSISTANT_ROLE.to_string()),
                        Some(ARCH_FC_MODEL_NAME.to_string()),
-                        self.tool_calls.to_owned(),
+                        None,
                    ),
                    ChatCompletionStreamResponse::new(
                        self.tool_call_response.clone(),
@ -349,25 +384,47 @@ impl HttpContext for StreamContext {
                        *metadata = Value::Object(serde_json::Map::new());
                    }

-                    let fc_messages = vec![
-                        self.generate_toll_call_message(),
-                        self.generate_api_response_message(),
-                    ];
+                    let tool_call_message = self.generate_tool_call_message();
+                    let tool_call_message_str = serde_json::to_string(&tool_call_message).unwrap();
+                    metadata.as_object_mut().unwrap().insert(
+                        X_ARCH_TOOL_CALL.to_string(),
+                        serde_json::Value::String(tool_call_message_str),
+                    );
+
+                    let api_response_message = self.generate_api_response_message();
+                    let api_response_message_str =
+                        serde_json::to_string(&api_response_message).unwrap();
+                    metadata.as_object_mut().unwrap().insert(
+                        X_ARCH_API_RESPONSE.to_string(),
+                        serde_json::Value::String(api_response_message_str),
+                    );
+
+                    let fc_messages = vec![tool_call_message, api_response_message];
+
                    let fc_messages_str = serde_json::to_string(&fc_messages).unwrap();
                    let arch_state = HashMap::from([("messages".to_string(), fc_messages_str)]);
                    let arch_state_str = serde_json::to_string(&arch_state).unwrap();
                    metadata.as_object_mut().unwrap().insert(
-                        ARCH_STATE_HEADER.to_string(),
+                        X_ARCH_STATE_HEADER.to_string(),
                        serde_json::Value::String(arch_state_str),
                    );
+
+                    if let Some(arch_fc_response) = self.arch_fc_response.as_ref() {
+                        metadata.as_object_mut().unwrap().insert(
+                            X_ARCH_FC_MODEL_RESPONSE.to_string(),
+                            serde_json::Value::String(
+                                serde_json::to_string(arch_fc_response).unwrap(),
+                            ),
+                        );
+                    }
                    let data_serialized = serde_json::to_string(&data).unwrap();
-                    debug!("archgw <= developer: {}", data_serialized);
+                    info!("archgw <= developer: {}", data_serialized);
                    self.set_http_response_body(0, body_size, data_serialized.as_bytes());
                };
            }
        }

-        trace!("recv [S={}] end_stream={}", self.context_id, end_of_stream);
+        debug!("recv [S={}] end_stream={}", self.context_id, end_of_stream);

        Action::Continue
    }
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -2,20 +2,21 @@ use crate::metrics::Metrics;
 use crate::tools::compute_request_path_body;
 use common::api::open_ai::{
    to_server_events, ArchState, ChatCompletionStreamResponse, ChatCompletionsRequest,
-    ChatCompletionsResponse, Message, ModelServerResponse, ToolCall,
+    ChatCompletionsResponse, Message, ToolCall,
 };
-use common::configuration::{Overrides, PromptTarget, Tracing};
+use common::configuration::{Endpoint, Overrides, PromptTarget, Tracing};
 use common::consts::{
    API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME,
    ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY,
    REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
+    X_ARCH_FC_MODEL_RESPONSE,
 };
 use common::errors::ServerError;
 use common::http::{CallArgs, Client};
 use common::stats::Gauge;
 use derivative::Derivative;
 use http::StatusCode;
-use log::{debug, trace, warn};
+use log::{debug, info, warn};
 use proxy_wasm::traits::*;
 use std::cell::RefCell;
 use std::collections::HashMap;
@ -46,6 +47,7 @@ pub struct StreamCallContext {
 pub struct StreamContext {
    system_prompt: Rc<Option<String>>,
    pub prompt_targets: Rc<HashMap<String, PromptTarget>>,
+    pub endpoints: Rc<Option<HashMap<String, Endpoint>>>,
    pub overrides: Rc<Option<Overrides>>,
    pub metrics: Rc<Metrics>,
    pub callouts: RefCell<HashMap<u32, StreamCallContext>>,
@ -63,15 +65,16 @@ pub struct StreamContext {
    pub time_to_first_token: Option<u128>,
    pub traceparent: Option<String>,
    pub _tracing: Rc<Option<Tracing>>,
+    pub arch_fc_response: Option<String>,
 }

 impl StreamContext {
-    #[allow(clippy::too_many_arguments)]
    pub fn new(
        context_id: u32,
        metrics: Rc<Metrics>,
        system_prompt: Rc<Option<String>>,
        prompt_targets: Rc<HashMap<String, PromptTarget>>,
+        endpoints: Rc<Option<HashMap<String, Endpoint>>>,
        overrides: Rc<Option<Overrides>>,
        tracing: Rc<Option<Tracing>>,
    ) -> Self {
@ -80,6 +83,7 @@ impl StreamContext {
            metrics,
            system_prompt,
            prompt_targets,
+            endpoints,
            callouts: RefCell::new(HashMap::new()),
            chat_completions_request: None,
            tool_calls: None,
@ -95,6 +99,7 @@ impl StreamContext {
            _tracing: tracing,
            start_upstream_llm_request_time: 0,
            time_to_first_token: None,
+            arch_fc_response: None,
        }
    }

@ -125,10 +130,10 @@ impl StreamContext {
        mut callout_context: StreamCallContext,
    ) {
        let body_str = String::from_utf8(body).unwrap();
-        debug!("model server response received");
-        trace!("response body: {}", body_str);
+        info!("on_http_call_response: model server response received");
+        debug!("response body: {}", body_str);

-        let model_server_response: ModelServerResponse = match serde_json::from_str(&body_str) {
+        let model_server_response: ChatCompletionsResponse = match serde_json::from_str(&body_str) {
            Ok(arch_fc_response) => arch_fc_response,
            Err(e) => {
                warn!(
@ -139,77 +144,122 @@ impl StreamContext {
            }
        };

-        let arch_fc_response = match model_server_response {
-            ModelServerResponse::ChatCompletionsResponse(response) => response,
-            ModelServerResponse::ModelServerErrorResponse(response) => {
-                debug!("archgw <= modelserver error response: {}", response.result);
-                if response.result == "No intent matched" {
-                    if let Some(default_prompt_target) = self
-                        .prompt_targets
-                        .values()
-                        .find(|pt| pt.default.unwrap_or(false))
-                    {
-                        debug!("default prompt target found, forwarding request to default prompt target");
-                        let endpoint = default_prompt_target.endpoint.clone().unwrap();
-                        let upstream_path: String = endpoint.path.unwrap_or(String::from("/"));
+        let intent_matched = check_intent_matched(&model_server_response);
+        info!("intent matched: {}", intent_matched);

-                        let upstream_endpoint = endpoint.name;
-                        let mut params = HashMap::new();
-                        params.insert(
-                            MESSAGES_KEY.to_string(),
-                            callout_context.request_body.messages.clone(),
-                        );
-                        let arch_messages_json = serde_json::to_string(&params).unwrap();
-                        let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();
+        self.arch_fc_response = model_server_response
+            .metadata
+            .as_ref()
+            .and_then(|metadata| metadata.get(X_ARCH_FC_MODEL_RESPONSE))
+            .cloned();

-                        let mut headers = vec![
-                            (":method", "POST"),
-                            (ARCH_UPSTREAM_HOST_HEADER, &upstream_endpoint),
-                            (":path", &upstream_path),
-                            (":authority", &upstream_endpoint),
-                            ("content-type", "application/json"),
-                            ("x-envoy-max-retries", "3"),
-                            ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
-                        ];
+        if !intent_matched {
+            // check if we have a default prompt target
+            if let Some(default_prompt_target) = self
+                .prompt_targets
+                .values()
+                .find(|pt| pt.default.unwrap_or(false))
+            {
+                info!("default prompt target found, forwarding request to default prompt target");
+                let endpoint = default_prompt_target.endpoint.clone().unwrap();
+                let upstream_path: String = endpoint.path.unwrap_or(String::from("/"));

-                        if self.request_id.is_some() {
-                            headers.push((REQUEST_ID_HEADER, self.request_id.as_ref().unwrap()));
-                        }
+                let upstream_endpoint = endpoint.name;
+                let mut params = HashMap::new();
+                params.insert(
+                    MESSAGES_KEY.to_string(),
+                    callout_context.request_body.messages.clone(),
+                );
+                let arch_messages_json = serde_json::to_string(&params).unwrap();
+                let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();

-                        // if self.trace_arch_internal() && self.traceparent.is_some() {
-                        //     headers.push((TRACE_PARENT_HEADER, self.traceparent.as_ref().unwrap()));
-                        // }
+                let mut headers = vec![
+                    (":method", "POST"),
+                    (ARCH_UPSTREAM_HOST_HEADER, &upstream_endpoint),
+                    (":path", &upstream_path),
+                    (":authority", &upstream_endpoint),
+                    ("content-type", "application/json"),
+                    ("x-envoy-max-retries", "3"),
+                    ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
+                ];

-                        let call_args = CallArgs::new(
-                            ARCH_INTERNAL_CLUSTER_NAME,
-                            &upstream_path,
-                            headers,
-                            Some(arch_messages_json.as_bytes()),
-                            vec![],
-                            Duration::from_secs(5),
-                        );
-                        callout_context.response_handler_type = ResponseHandlerType::DefaultTarget;
-                        callout_context.prompt_target_name =
-                            Some(default_prompt_target.name.clone());
+                if self.request_id.is_some() {
+                    headers.push((REQUEST_ID_HEADER, self.request_id.as_ref().unwrap()));
+                }

-                        if let Err(e) = self.http_call(call_args, callout_context) {
-                            warn!("error dispatching default prompt target request: {}", e);
-                            return self.send_server_error(
-                                ServerError::HttpDispatch(e),
-                                Some(StatusCode::BAD_REQUEST),
-                            );
-                        }
-                        return;
+                let call_args = CallArgs::new(
+                    ARCH_INTERNAL_CLUSTER_NAME,
+                    &upstream_path,
+                    headers,
+                    Some(arch_messages_json.as_bytes()),
+                    vec![],
+                    Duration::from_secs(5),
+                );
+                callout_context.response_handler_type = ResponseHandlerType::DefaultTarget;
+                callout_context.prompt_target_name = Some(default_prompt_target.name.clone());
+
+                if let Err(e) = self.http_call(call_args, callout_context) {
+                    warn!("error dispatching default prompt target request: {}", e);
+                    return self.send_server_error(
+                        ServerError::HttpDispatch(e),
+                        Some(StatusCode::BAD_REQUEST),
+                    );
+                }
+                return;
+            } else {
+                info!("no default prompt target found, forwarding request to upstream llm");
+                let mut messages = Vec::new();
+                // add system prompt
+                match self.system_prompt.as_ref() {
+                    None => {}
+                    Some(system_prompt) => {
+                        let system_prompt_message = Message {
+                            role: SYSTEM_ROLE.to_string(),
+                            content: Some(system_prompt.clone()),
+                            model: None,
+                            tool_calls: None,
+                            tool_call_id: None,
+                        };
+                        messages.push(system_prompt_message);
                    }
                }
-                return self.send_server_error(
-                    ServerError::LogicError(response.result),
-                    Some(StatusCode::BAD_REQUEST),
-                );
-            }
-        };

-        arch_fc_response.choices[0]
+                messages.append(
+                    &mut self
+                        .filter_out_arch_messages(callout_context.request_body.messages.as_ref()),
+                );
+
+                let chat_completion_request = ChatCompletionsRequest {
+                    model: self
+                        .chat_completions_request
+                        .as_ref()
+                        .unwrap()
+                        .model
+                        .clone(),
+                    messages,
+                    tools: None,
+                    stream: callout_context.request_body.stream,
+                    stream_options: callout_context.request_body.stream_options,
+                    metadata: None,
+                };
+
+                let chat_completion_request_json =
+                    serde_json::to_string(&chat_completion_request).unwrap();
+                info!(
+                    "archgw => upstream llm request: {}",
+                    chat_completion_request_json
+                );
+                self.set_http_request_body(
+                    0,
+                    self.request_body_size,
+                    chat_completion_request_json.as_bytes(),
+                );
+                self.resume_http_request();
+                return;
+            }
+        }
+
+        model_server_response.choices[0]
            .message
            .tool_calls
            .clone_into(&mut self.tool_calls);
@ -231,14 +281,14 @@ impl StreamContext {
            let direct_response_str = if self.streaming_response {
                let chunks = vec![
                    ChatCompletionStreamResponse::new(
-                        None,
+                        self.arch_fc_response.clone(),
                        Some(ASSISTANT_ROLE.to_string()),
-                        Some(ARCH_FC_MODEL_NAME.to_owned()),
+                        Some(ARCH_FC_MODEL_NAME.to_string()),
                        None,
                    ),
                    ChatCompletionStreamResponse::new(
                        Some(
-                            arch_fc_response.choices[0]
+                            model_server_response.choices[0]
                                .message
                                .content
                                .as_ref()
@ -246,7 +296,7 @@ impl StreamContext {
                                .clone(),
                        ),
                        None,
-                        Some(ARCH_FC_MODEL_NAME.to_owned()),
+                        Some(format!("{}-Chat", ARCH_FC_MODEL_NAME.to_owned())),
                        None,
                    ),
                ];
@ -268,12 +318,59 @@ impl StreamContext {
        callout_context.prompt_target_name =
            Some(self.tool_calls.as_ref().unwrap()[0].function.name.clone());

+        // In agent-orchestrator mode no developer API call is scheduled: the request
+        // body is rewritten with orchestration metadata and the HTTP request is
+        // resumed instead.
+        if let Some(overrides) = self.overrides.as_ref() {
+            if overrides.use_agent_orchestrator.unwrap_or_default() {
+                let mut metadata = HashMap::new();
+                metadata.insert("use_agent_orchestrator".to_string(), "true".to_string());
+
+                // prompt_target_name was assigned Some(..) just above, so the
+                // unwrap cannot fail here.
+                metadata.insert(
+                    "agent-name".to_string(),
+                    callout_context
+                        .prompt_target_name
+                        .as_ref()
+                        .unwrap()
+                        .to_string(),
+                );
+
+                if overrides.optimize_context_window.unwrap_or_default() {
+                    metadata.insert("optimize_context_window".to_string(), "true".to_string());
+                }
+
+                let messages = self.construct_llm_messages(&callout_context);
+
+                let chat_completion_request = ChatCompletionsRequest {
+                    model: callout_context.request_body.model.clone(),
+                    messages,
+                    tools: None,
+                    stream: callout_context.request_body.stream,
+                    stream_options: callout_context.request_body.stream_options.clone(),
+                    metadata: Some(metadata),
+                };
+
+                let body_str = serde_json::to_string(&chat_completion_request).unwrap();
+                info!("sending request to llm agent: {}", body_str);
+                self.set_http_request_body(0, self.request_body_size, body_str.as_bytes());
+                self.resume_http_request();
+                return;
+            }
+        }
+
        self.schedule_api_call_request(callout_context);
    }

    fn schedule_api_call_request(&mut self, mut callout_context: StreamCallContext) {
+        // Clone the prompt target below to avoid mutable borrow conflicts
+
        let tools_call_name = self.tool_calls.as_ref().unwrap()[0].function.name.clone();
-        let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap();
+        let prompt_target = self.prompt_targets.get(&tools_call_name).unwrap().clone();
        let tool_params = &self.tool_calls.as_ref().unwrap()[0].function.arguments;
        let endpoint_details = prompt_target.endpoint.as_ref().unwrap();
        let endpoint_path: String = endpoint_details
@ -285,7 +382,7 @@ impl StreamContext {
        let http_method = endpoint_details.method.clone().unwrap_or_default();
        let prompt_target_params = prompt_target.parameters.clone().unwrap_or_default();

-        let (path, body) = match compute_request_path_body(
+        let (path, api_call_body) = match compute_request_path_body(
            &endpoint_path,
            tool_params,
            &prompt_target_params,
@ -302,6 +399,8 @@ impl StreamContext {
            }
        };

+        debug!("on_http_call_response: api call body {:?}", api_call_body);
+
        let timeout_str = API_REQUEST_TIMEOUT_MS.to_string();

        let http_method_str = http_method.to_string();
@ -335,13 +434,13 @@ impl StreamContext {
            ARCH_INTERNAL_CLUSTER_NAME,
            &path,
            headers.into_iter().collect(),
-            body.as_deref().map(|s| s.as_bytes()),
+            api_call_body.as_deref().map(|s| s.as_bytes()),
            vec![],
            Duration::from_secs(5),
        );

-        debug!(
-            "dispatching api call to developer endpoint: {}, path: {}, method: {}",
+        info!(
+            "on_http_call_response: dispatching api call to developer endpoint: {}, path: {}, method: {}",
            endpoint_details.name, path, http_method_str
        );

@ -358,10 +457,15 @@ impl StreamContext {
        let http_status = self
            .get_http_call_response_header(":status")
            .unwrap_or(StatusCode::OK.as_str().to_string());
-        debug!(
-            "developer api call response received: status code: {}",
+        info!(
+            "on_http_call_response: developer api call response received: status code: {}",
            http_status
        );
+        let prompt_target = self
+            .prompt_targets
+            .get(callout_context.prompt_target_name.as_ref().unwrap())
+            .unwrap()
+            .clone();
        if http_status != StatusCode::OK.as_str() {
            warn!(
                "api server responded with non 2xx status code: {}",
@ -378,7 +482,7 @@ impl StreamContext {
            );
        }
        self.tool_call_response = Some(String::from_utf8(body).unwrap());
-        trace!(
+        debug!(
            "response body: {}",
            self.tool_call_response.as_ref().unwrap()
        );
@ -397,6 +501,37 @@ impl StreamContext {
            }
        };

+        // A prompt target may opt out of the follow-up LLM call; its API response
+        // is then returned to the client directly.
+        let skip_llm_dispatch = !prompt_target.auto_llm_dispatch_on_response.unwrap_or(true);
+        if skip_llm_dispatch {
+            let api_response = self.tool_call_response.as_ref().unwrap().clone();
+
+            let response_body = if !self.streaming_response {
+                api_response
+            } else {
+                // Streaming replies are emitted as two server events: one carrying
+                // the assistant role, one carrying the API response.
+                let role_chunk = ChatCompletionStreamResponse::new(
+                    None,
+                    Some(ASSISTANT_ROLE.to_string()),
+                    Some(ARCH_FC_MODEL_NAME.to_owned()),
+                    None,
+                );
+                let content_chunk = ChatCompletionStreamResponse::new(
+                    Some(api_response),
+                    None,
+                    Some(ARCH_FC_MODEL_NAME.to_owned()),
+                    None,
+                );
+                to_server_events(vec![role_chunk, content_chunk])
+            };
+
+            return self.send_http_response(
+                StatusCode::OK.as_u16().into(),
+                vec![],
+                Some(response_body.as_bytes()),
+            );
+        }
+
        let final_prompt = format!(
            "{}\ncontext: {}",
            user_message.content.unwrap(),
@ -429,8 +564,8 @@ impl StreamContext {
                return self.send_server_error(ServerError::Serialization(e), None);
            }
        };
-        debug!("sending request to upstream llm");
-        trace!("request body: {}", llm_request_str);
+        info!("on_http_call_response: sending request to upstream llm");
+        debug!("request body: {}", llm_request_str);

        self.start_upstream_llm_request_time = SystemTime::now()
            .duration_since(UNIX_EPOCH)
@ -491,13 +626,24 @@ impl StreamContext {
        messages
    }

-    pub fn generate_toll_call_message(&mut self) -> Message {
-        Message {
-            role: ASSISTANT_ROLE.to_string(),
-            content: None,
-            model: Some(ARCH_FC_MODEL_NAME.to_string()),
-            tool_calls: self.tool_calls.clone(),
-            tool_call_id: None,
+    /// Builds the assistant message attributed to the arch-fc model.
+    ///
+    /// When `arch_fc_response` is `None`, the message carries a clone of
+    /// `self.tool_calls` and no content; otherwise it carries a clone of
+    /// `arch_fc_response` as its content and no tool calls.
+    pub fn generate_tool_call_message(&mut self) -> Message {
+        if self.arch_fc_response.is_none() {
+            info!("arch_fc_response is none, generating tool call message");
+            Message {
+                role: ASSISTANT_ROLE.to_string(),
+                content: None,
+                model: Some(ARCH_FC_MODEL_NAME.to_string()),
+                tool_calls: self.tool_calls.clone(),
+                tool_call_id: None,
+            }
+        } else {
+            Message {
+                role: ASSISTANT_ROLE.to_string(),
+                content: self.arch_fc_response.as_ref().cloned(),
+                model: Some(ARCH_FC_MODEL_NAME.to_string()),
+                tool_calls: None,
+                tool_call_id: None,
+            }
        }
    }

@ -519,10 +665,7 @@ impl StreamContext {
            .clone();

        // check if the default target should be dispatched to the LLM provider
-        if !prompt_target
-            .auto_llm_dispatch_on_response
-            .unwrap_or_default()
-        {
+        if !prompt_target.auto_llm_dispatch_on_response.unwrap_or(true) {
            let default_target_response_str = if self.streaming_response {
                let chat_completion_response =
                    match serde_json::from_slice::<ChatCompletionsResponse>(&body) {
@ -626,12 +769,29 @@ impl StreamContext {
        };

        let json_resp = serde_json::to_string(&chat_completion_request).unwrap();
-        debug!("archgw => (default target) llm request: {}", json_resp);
+        info!("archgw => (default target) llm request: {}", json_resp);
        self.set_http_request_body(0, self.request_body_size, json_resp.as_bytes());
        self.resume_http_request();
    }
 }

+/// Returns true when the first choice carries non-empty content or at least
+/// one tool call; false otherwise, including when there are no choices.
+fn check_intent_matched(model_server_response: &ChatCompletionsResponse) -> bool {
+    let message = model_server_response
+        .choices
+        .first()
+        .map(|choice| &choice.message);
+    let content_has_value = message
+        .and_then(|message| message.content.as_ref())
+        .map_or(false, |content| !content.is_empty());
+    let has_tool_calls = message
+        .and_then(|message| message.tool_calls.as_ref())
+        .map_or(false, |tool_calls| !tool_calls.is_empty());
+    // intent was matched if content has some value or tool_calls is non-empty
+    content_has_value || has_tool_calls
+}
+
 impl Client for StreamContext {
    type CallContext = StreamCallContext;

@ -643,3 +803,77 @@ impl Client for StreamContext {
        &self.metrics.active_http_calls
    }
 }
+
+#[cfg(test)]
+mod test {
+    use common::api::open_ai::{
+        ChatCompletionsResponse, Choice, FunctionCallDetail, Message, ToolCall, ToolType,
+    };
+
+    use crate::stream_context::check_intent_matched;
+
+    /// Wraps the given choices in a model server response, using the same fixed
+    /// `usage`, `model` and `metadata` values for every test case.
+    fn response_with_choices(choices: Vec<Choice>) -> ChatCompletionsResponse {
+        ChatCompletionsResponse {
+            choices,
+            usage: None,
+            model: "arch-fc".to_string(),
+            metadata: None,
+        }
+    }
+
+    /// Builds a single-choice assistant response with the given content and tool calls.
+    fn response_with(
+        content: Option<&str>,
+        tool_calls: Option<Vec<ToolCall>>,
+    ) -> ChatCompletionsResponse {
+        response_with_choices(vec![Choice {
+            message: Message {
+                content: content.map(str::to_string),
+                tool_calls,
+                role: "assistant".to_string(),
+                model: None,
+                tool_call_id: None,
+            },
+            finish_reason: None,
+            index: None,
+        }])
+    }
+
+    /// A minimal function tool call without arguments.
+    fn sample_tool_call() -> ToolCall {
+        ToolCall {
+            id: "1".to_string(),
+            function: FunctionCallDetail {
+                name: "test".to_string(),
+                arguments: None,
+            },
+            tool_type: ToolType::Function,
+        }
+    }
+
+    #[test]
+    fn test_intent_matched() {
+        // empty content and empty tool_calls: nothing matched
+        assert!(!check_intent_matched(&response_with(
+            Some(""),
+            Some(vec![])
+        )));
+
+        // non-empty content alone is a match
+        assert!(check_intent_matched(&response_with(
+            Some("hello"),
+            Some(vec![])
+        )));
+
+        // at least one tool call alone is a match
+        assert!(check_intent_matched(&response_with(
+            Some(""),
+            Some(vec![sample_tool_call()])
+        )));
+    }
+
+    #[test]
+    fn test_intent_matched_with_absent_fields() {
+        // neither content nor tool_calls present: nothing matched
+        assert!(!check_intent_matched(&response_with(None, None)));
+
+        // empty content with tool_calls absent: nothing matched
+        assert!(!check_intent_matched(&response_with(Some(""), None)));
+
+        // a tool call is a match even when content is absent
+        assert!(check_intent_matched(&response_with(
+            None,
+            Some(vec![sample_tool_call()])
+        )));
+
+        // a response without any choices never matches
+        assert!(!check_intent_matched(&response_with_choices(vec![])));
+    }
+}
--- a/crates/prompt_gateway/src/tools.rs
+++ b/crates/prompt_gateway/src/tools.rs
@ -4,8 +4,13 @@ use std::collections::HashMap;
 use serde_yaml::Value;

 // only add params that are of string, number and bool type
-pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<String, String> {
+pub fn filter_tool_params(tool_params: &Option<HashMap<String, Value>>) -> HashMap<String, String> {
+    if tool_params.is_none() {
+        return HashMap::new();
+    }
    tool_params
+        .as_ref()
+        .unwrap()
        .iter()
        .filter(|(_, value)| value.is_number() || value.is_string() || value.is_bool())
        .map(|(key, value)| match value {
@ -22,7 +27,7 @@ pub fn filter_tool_params(tool_params: &HashMap<String, Value>) -> HashMap<Strin

 pub fn compute_request_path_body(
    endpoint_path: &str,
-    tool_params: &HashMap<String, Value>,
+    tool_params: &Option<HashMap<String, Value>>,
    prompt_target_params: &[Parameter],
    http_method: &HttpMethod,
 ) -> Result<(String, Option<String>), String> {
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -24,12 +24,12 @@ fn wasm_module() -> String {
 fn request_headers_expectations(module: &mut Tester, http_context: i32) {
    module
        .call_proxy_on_request_headers(http_context, 0, false)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_remove_header_map_value(Some(MapType::HttpRequestHeaders), Some("content-length"))
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some(":path"))
        .returning(Some("/v1/chat/completions"))
        .expect_get_header_map_pairs(Some(MapType::HttpRequestHeaders))
        .returning(None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("x-request-id"))
        .returning(None)
        .expect_get_header_map_value(Some(MapType::HttpRequestHeaders), Some("traceparent"))
@ -69,10 +69,14 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
            chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
        // The actual call is not important in this test, we just need to grab the token_id
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
@ -81,16 +85,13 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
                (":path", "/function_calling"),
                ("content-type", "application/json"),
                (":authority", "model_server"),
+                ("x-envoy-upstream-rq-timeout-ms", "30000"),
            ]),
            None,
            None,
-            None,
+            Some(5000),
        )
        .returning(Some(1))
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_metric_increment("active_http_calls", 1)
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
@ -232,13 +233,13 @@ fn prompt_gateway_successful_request_to_open_ai_chat_completions() {
            chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(chat_completions_request_body))
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_http_call(Some("arch_internal"), None, None, None, None)
        .returning(Some(4))
        .expect_metric_increment("active_http_calls", 1)
@ -295,16 +296,16 @@ fn prompt_gateway_bad_request_to_open_ai_chat_completions() {
            incomplete_chat_completions_request_body.len() as i32,
            true,
        )
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_get_buffer_bytes(Some(BufferType::HttpRequestBody))
        .returning(Some(incomplete_chat_completions_request_body))
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .expect_send_local_response(
            Some(StatusCode::BAD_REQUEST.as_u16().into()),
            None,
            None,
            None,
        )
-        .expect_log(Some(LogLevel::Trace), None)
        .execute_and_expect(ReturnType::Action(Action::Pause))
        .unwrap();
 }
@ -351,10 +352,10 @@ fn prompt_gateway_request_to_llm_gateway() {
                    tool_type: ToolType::Function,
                    function: FunctionCallDetail {
                        name: String::from("weather_forecast"),
-                        arguments: HashMap::from([(
+                        arguments: Some(HashMap::from([(
                            String::from("city"),
                            Value::String(String::from("seattle")),
-                        )]),
+                        )])),
                    },
                }]),
                model: None,
@ -362,7 +363,11 @@ fn prompt_gateway_request_to_llm_gateway() {
            },
        }],
        model: String::from("test"),
-        metadata: None,
+        metadata: {
+            let mut map: HashMap<String, String> = HashMap::new();
+            map.insert("function_latency".to_string(), "0.0".to_string());
+            Some(map)
+        },
    };

    let expected_body = "{\"city\":\"seattle\"}";
@ -373,27 +378,30 @@ fn prompt_gateway_request_to_llm_gateway() {
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&arch_fc_resp_str))
        .expect_log(Some(LogLevel::Warn), None)
+        .expect_log(Some(LogLevel::Info), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_http_call(
            Some("arch_internal"),
            Some(vec![
-                (":method", "POST"),
-                ("content-type", "application/json"),
-                ("x-arch-upstream", "api_server"),
-                (":authority", "api_server"),
                ("x-envoy-max-retries", "3"),
+                ("x-arch-upstream", "api_server"),
+                ("content-type", "application/json"),
+                ("x-envoy-upstream-rq-timeout-ms", "30000"),
                (":path", "/weather"),
+                (":method", "POST"),
+                (":authority", "api_server"),
            ]),
            Some(expected_body),
            None,
-            None,
+            Some(5000),
        )
        .returning(Some(2))
        .expect_metric_increment("active_http_calls", 1)
+        .expect_log(Some(LogLevel::Trace), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();

@ -403,14 +411,14 @@ fn prompt_gateway_request_to_llm_gateway() {
        .expect_metric_increment("active_http_calls", -1)
        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
        .returning(Some(&body_text))
+        .expect_log(Some(LogLevel::Info), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
+        .expect_log(Some(LogLevel::Info), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_get_header_map_value(Some(MapType::HttpCallResponseHeaders), Some(":status"))
        .returning(Some("200"))
        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
+        .expect_log(Some(LogLevel::Debug), None)
        .execute_and_expect(ReturnType::None)
        .unwrap();

@ -442,11 +450,241 @@ fn prompt_gateway_request_to_llm_gateway() {
        )
        .expect_get_buffer_bytes(Some(BufferType::HttpResponseBody))
        .returning(Some(chat_completion_response_str.as_str()))
-        .expect_log(Some(LogLevel::Trace), None)
        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), None)
        .expect_set_buffer_bytes(Some(BufferType::HttpResponseBody), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Info), None)
        .expect_log(Some(LogLevel::Debug), None)
-        .expect_log(Some(LogLevel::Trace), None)
        .execute_and_expect(ReturnType::Action(Action::Continue))
        .unwrap();
 }
+
+/// Exercises the path where the function-calling response carries neither content
+/// nor tool calls while `default_config()` is loaded: the filter is expected to log
+/// "intent matched: false", log that no default prompt target was found, and then
+/// set the HTTP request body.
+#[test]
+#[serial]
+fn prompt_gateway_request_no_intent_match() {
+    let args = tester::MockSettings {
+        wasm_path: wasm_module(),
+        quiet: false,
+        allow_unexpected: false,
+    };
+    let mut module = tester::mock(args).unwrap();
+
+    module
+        .call_start()
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+
+    // Setup Filter
+    let mut config: Configuration = serde_yaml::from_str(default_config()).unwrap();
+    // raise the first rate limit's token budget by 1000
+    config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
+    let config_str = serde_json::to_string(&config).unwrap();
+
+    let filter_context = setup_filter(&mut module, &config_str);
+
+    // Setup HTTP Stream
+    let http_context = 2;
+
+    normal_flow(&mut module, filter_context, http_context);
+
+    // model server reply with no content and no tool calls
+    let arch_fc_resp = ChatCompletionsResponse {
+        usage: Some(Usage {
+            completion_tokens: 0,
+        }),
+        choices: vec![Choice {
+            finish_reason: Some("test".to_string()),
+            index: Some(0),
+            message: Message {
+                role: "assistant".to_string(),
+                content: None,
+                tool_calls: None,
+                model: None,
+                tool_call_id: None,
+            },
+        }],
+        model: String::from("test"),
+        metadata: None,
+    };
+
+    let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
+    // deliver the reply for call token 1, the id returned by the mocked http call in normal_flow
+    module
+        .call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
+        .expect_metric_increment("active_http_calls", -1)
+        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
+        .returning(Some(&arch_fc_resp_str))
+        .expect_log(Some(LogLevel::Warn), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), Some("intent matched: false"))
+        .expect_log(
+            Some(LogLevel::Info),
+            Some("no default prompt target found, forwarding request to upstream llm"),
+        )
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_set_buffer_bytes(Some(BufferType::HttpRequestBody), None)
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+}
+
+/// Returns an arch configuration fixture (YAML) that defines a `weather_forecast`
+/// prompt target plus a `default_target` marked `default: true` with
+/// `auto_llm_dispatch_on_response: false`.
+///
+/// NOTE(review): `default_target` points at endpoint `weather_forecast_service`, which
+/// is not declared under `endpoints` (only `api_server` is) — confirm this is intended.
+fn arch_config_default_target() -> &'static str {
+    r#"
+version: "0.1-beta"
+
+listener:
+  address: 0.0.0.0
+  port: 10000
+  message_format: huggingface
+  connect_timeout: 0.005s
+
+endpoints:
+  api_server:
+    endpoint: api_server:80
+    connect_timeout: 0.005s
+
+llm_providers:
+  - name: open-ai-gpt-4
+    provider_interface: openai
+    access_key: secret_key
+    model: gpt-4
+    default: true
+
+overrides:
+  # confidence threshold for prompt target intent matching
+  prompt_target_intent_matching_threshold: 0.0
+
+system_prompt: |
+  You are a helpful assistant.
+
+prompt_guards:
+  input_guards:
+    jailbreak:
+      on_exception:
+        message: "Looks like you're curious about my abilities, but I can only provide assistance within my programmed parameters."
+
+prompt_targets:
+  - name: weather_forecast
+    description: This function provides realtime weather forecast information for a given city.
+    parameters:
+      - name: city
+        required: true
+        description: The city for which the weather forecast is requested.
+      - name: days
+        description: The number of days for which the weather forecast is requested.
+      - name: units
+        description: The units in which the weather forecast is requested.
+    endpoint:
+      name: api_server
+      path: /weather
+      http_method: POST
+    system_prompt: |
+      You are a helpful weather forecaster. Use weater data that is provided to you. Please following following guidelines when responding to user queries:
+      - Use farenheight for temperature
+      - Use miles per hour for wind speed
+
+  - name: default_target
+    default: true
+    description: This is the default target for all unmatched prompts.
+    endpoint:
+      name: weather_forecast_service
+      path: /default_target
+      http_method: POST
+    system_prompt: |
+      You are a helpful assistant! Summarize the user's request and provide a helpful response.
+    # if it is set to false arch will send response that it received from this prompt target to the user
+    # if true arch will forward the response to the default LLM
+    auto_llm_dispatch_on_response: false
+
+ratelimits:
+  - model: gpt-4
+    selector:
+      key: selector-key
+      value: selector-value
+    limit:
+      tokens: 1
+      unit: minute
+"#
+}
+
+/// Exercises the path where the function-calling response carries neither content
+/// nor tool calls while the config from `arch_config_default_target()` is loaded:
+/// the filter is expected to log "intent matched: false", log that a default prompt
+/// target was found, and issue an HTTP call addressed to `/default_target`.
+#[test]
+#[serial]
+fn prompt_gateway_request_no_intent_match_default_target() {
+    let args = tester::MockSettings {
+        wasm_path: wasm_module(),
+        quiet: false,
+        allow_unexpected: false,
+    };
+    let mut module = tester::mock(args).unwrap();
+
+    module
+        .call_start()
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+
+    // Setup Filter
+    let mut config: Configuration = serde_yaml::from_str(arch_config_default_target()).unwrap();
+    // raise the first rate limit's token budget by 1000
+    config.ratelimits.as_mut().unwrap()[0].limit.tokens += 1000;
+    let config_str = serde_json::to_string(&config).unwrap();
+
+    let filter_context = setup_filter(&mut module, &config_str);
+
+    // Setup HTTP Stream
+    let http_context = 2;
+
+    normal_flow(&mut module, filter_context, http_context);
+
+    // model server reply with no content and no tool calls
+    let arch_fc_resp = ChatCompletionsResponse {
+        usage: Some(Usage {
+            completion_tokens: 0,
+        }),
+        choices: vec![Choice {
+            finish_reason: Some("test".to_string()),
+            index: Some(0),
+            message: Message {
+                role: "system".to_string(),
+                content: None,
+                tool_calls: None,
+                model: None,
+                tool_call_id: None,
+            },
+        }],
+        model: String::from("test"),
+        metadata: None,
+    };
+
+    let arch_fc_resp_str = serde_json::to_string(&arch_fc_resp).unwrap();
+    // deliver the reply for call token 1, the id returned by the mocked http call in normal_flow
+    module
+        .call_proxy_on_http_call_response(http_context, 1, 0, arch_fc_resp_str.len() as i32, 0)
+        .expect_metric_increment("active_http_calls", -1)
+        .expect_get_buffer_bytes(Some(BufferType::HttpCallResponseBody))
+        .returning(Some(&arch_fc_resp_str))
+        .expect_log(Some(LogLevel::Warn), None)
+        .expect_log(Some(LogLevel::Info), None)
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), Some("intent matched: false"))
+        .expect_log(
+            Some(LogLevel::Info),
+            Some("default prompt target found, forwarding request to default prompt target"),
+        )
+        .expect_log(Some(LogLevel::Debug), None)
+        .expect_log(Some(LogLevel::Info), None)
+        // the outgoing call must target the default prompt target's endpoint and path
+        .expect_http_call(
+            Some("arch_internal"),
+            Some(vec![
+                (":method", "POST"),
+                ("x-arch-upstream", "weather_forecast_service"),
+                (":path", "/default_target"),
+                (":authority", "weather_forecast_service"),
+                ("content-type", "application/json"),
+                ("x-envoy-max-retries", "3"),
+                ("x-envoy-upstream-rq-timeout-ms", "30000"),
+            ]),
+            None,
+            None,
+            // presumably the call timeout in milliseconds — TODO confirm against the tester API
+            Some(5000),
+        )
+        .returning(Some(2))
+        .expect_metric_increment("active_http_calls", 1)
+        .execute_and_expect(ReturnType::None)
+        .unwrap();
+}
--- a/demos/samples_java/weather_forcecast_service/.classpath
+++ b/demos/samples_java/weather_forcecast_service/.classpath
@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+			<attribute name="test" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" path="target/generated-sources/annotations">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+			<attribute name="ignore_optional_problems" value="true"/>
+			<attribute name="m2e-apt" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+			<attribute name="ignore_optional_problems" value="true"/>
+			<attribute name="m2e-apt" value="true"/>
+			<attribute name="test" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
--- a/demos/samples_java/weather_forcecast_service/.project
+++ b/demos/samples_java/weather_forcecast_service/.project
@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>weather-forecast-service</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+	<filteredResources>
+		<filter>
+			<id>1742579142020</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.core.resources.regexFilterMatcher</id>
+				<arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments>
+			</matcher>
+		</filter>
+	</filteredResources>
+</projectDescription>
--- a/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.core.resources.prefs
+++ b/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.core.resources.prefs
@ -0,0 +1,4 @@
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/main/resources=UTF-8
+encoding/<project>=UTF-8
--- a/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.jdt.apt.core.prefs
+++ b/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.jdt.apt.core.prefs
@ -0,0 +1,2 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.apt.aptEnabled=false
--- a/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.jdt.core.prefs
+++ b/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.jdt.core.prefs
@ -0,0 +1,10 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.methodParameters=generate
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.processAnnotations=disabled
+org.eclipse.jdt.core.compiler.release=disabled
+org.eclipse.jdt.core.compiler.source=1.8
--- a/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.m2e.core.prefs
+++ b/demos/samples_java/weather_forcecast_service/.settings/org.eclipse.m2e.core.prefs
@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
--- a/demos/samples_java/weather_forcecast_service/Dockerfile
+++ b/demos/samples_java/weather_forcecast_service/Dockerfile
@ -14,5 +14,10 @@ WORKDIR /app
 # Copy the built jar from the previous stage
 COPY --from=build /app/target/weather-forecast-service-0.0.1-SNAPSHOT.jar app.jar
 # Expose the port on which the app runs (default Spring Boot is 8080)
+
+# Expose the application port and the debug port
 EXPOSE 8081
-ENTRYPOINT ["java", "-jar", "app.jar"]
+EXPOSE 5005
+
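+# Start the application with remote debugging enabled (JDWP listens on all interfaces at port 5005 without authentication; intended for local demo use only)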
+ENTRYPOINT ["java", "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005", "-jar", "app.jar"]
--- a/demos/samples_java/weather_forcecast_service/arch_config.yaml
+++ b/demos/samples_java/weather_forcecast_service/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 10000 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
@ -43,3 +45,7 @@ prompt_targets:
      name: weather_forecast_service
      path: /weather
      http_method: POST
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
--- a/demos/samples_java/weather_forcecast_service/docker-compose.yaml
+++ b/demos/samples_java/weather_forcecast_service/docker-compose.yaml
@ -5,6 +5,7 @@ services:
      dockerfile: Dockerfile
    ports:
      - "18081:8081"
+      - "5005:5005"

  chatbot_ui:
    build:
@ -18,3 +19,11 @@ services:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml
+
+  jaeger:
+    build:
+      context: ../../shared/jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
--- a/demos/samples_java/weather_forcecast_service/pom.xml
+++ b/demos/samples_java/weather_forcecast_service/pom.xml
@ -35,6 +35,15 @@
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <debug>true</debug>
+                    <debuglevel>lines,vars,source</debuglevel>
+                </configuration>
+            </plugin>
        </plugins>
    </build>
 </project>
--- a/demos/samples_python/currency_exchange/arch_config.yaml
+++ b/demos/samples_python/currency_exchange/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o
@ -18,7 +19,7 @@ endpoints:
    protocol: https

 system_prompt: |
-  You are a helpful assistant.
+  You are a helpful assistant. Only respond to queries related to currency exchange. If there are any other questions, I can't help you.

 prompt_guards:
  input_guards:
--- a/demos/samples_python/currency_exchange/hurl_tests/simple.hurl
+++ b/demos/samples_python/currency_exchange/hurl_tests/simple.hurl
@ -0,0 +1,19 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "convert 100 eur"
+    }
+  ]
+}
+HTTP 200
+[Asserts]
+header "content-type" == "application/json"
+jsonpath "$.model" matches /^gpt-4o/
+jsonpath "$.metadata.x-arch-state" != null
+jsonpath "$.usage" != null
+jsonpath "$.choices[0].message.content" != null
+jsonpath "$.choices[0].message.role" == "assistant"
--- a/demos/samples_python/currency_exchange/hurl_tests/simple_stream.hurl
+++ b/demos/samples_python/currency_exchange/hurl_tests/simple_stream.hurl
@ -0,0 +1,17 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "convert 100 eur"
+    }
+  ],
+  "stream": true
+}
+HTTP 200
+[Asserts]
+header "content-type" matches /text\/event-stream/
+body matches /^data: .*?currency_exchange.*?\n/
+body matches /^data: .*?EUR.*?\n/
--- a/demos/samples_python/human_resources_agent/arch_config.yaml
+++ b/demos/samples_python/human_resources_agent/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
--- a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
+++ b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 127.0.0.1
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 endpoints:
  rag_energy_source_agent:
--- a/demos/samples_python/network_switch_operator_agent/README.md
+++ b/demos/samples_python/network_switch_operator_agent/README.md
@ -28,7 +28,7 @@ The assistant can perform several key operations, including rebooting devices, a
 4. Tell me what can you do for me?"

 # Observability
-Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visalize the stats in dashboard. To see grafana dashboard follow instructions below,
+Arch gateway publishes stats endpoint at http://localhost:19901/stats. In this demo we are using prometheus to pull stats from arch and we are using grafana to visualize the stats in dashboard. To see grafana dashboard follow instructions below,

 1. Start grafana and prometheus using following command
   ```yaml
--- a/demos/samples_python/network_switch_operator_agent/arch_config.yaml
+++ b/demos/samples_python/network_switch_operator_agent/arch_config.yaml
@ -1,15 +1,17 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
  - name: OpenAI
    provider_interface: openai
    access_key: $OPENAI_API_KEY
-    model: gpt-3.5-turbo
+    model: gpt-4o
    default: true

 # default system prompt used by all prompt targets
@ -24,25 +26,26 @@ prompt_targets:
      path: /agent/device_summary
      http_method: POST
    parameters:
-      - name: device_ids
-        type: list
-        description: A list of device identifiers (IDs) to retrieve statistics for.
+      - name: device_id
+        type: str
+        description: A device identifier to retrieve statistics for.
-        required: true # device_ids are required to get device statistics
+        required: true # device_id is required to get device statistics
      - name: days
        type: int
        description: The number of days for which to gather device statistics.
-        default: "7"
-  - name: reboot_devices
-    description: Reboot a list of devices
+        default: 7
+  - name: reboot_device
+    description: Reboot a device
    endpoint:
      name: app_server
      path: /agent/device_reboot
      http_method: POST
    parameters:
-      - name: device_ids
-        type: list
-        description: A list of device identifiers (IDs).
+      - name: device_id
+        type: str
+        description: the device identifier
        required: true
+    system_prompt: You will get a status JSON object. Simply summarize it

 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
@ -53,3 +56,8 @@ endpoints:
    endpoint: host.docker.internal:18083
    # max time to wait for a connection to be established
    connect_timeout: 0.005s
+
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
--- a/demos/samples_python/network_switch_operator_agent/docker-compose.yaml
+++ b/demos/samples_python/network_switch_operator_agent/docker-compose.yaml
@ -18,3 +18,11 @@ services:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml
+
+  jaeger:
+    build:
+      context: ../../shared/jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
--- a/demos/samples_python/network_switch_operator_agent/main.py
+++ b/demos/samples_python/network_switch_operator_agent/main.py
@ -13,7 +13,7 @@ DEMO_DESCRIPTION = """This demo illustrates how **Arch** can be used to perform

 # Define the request model
 class DeviceSummaryRequest(BaseModel):
-    device_ids: List[int]
+    device_id: str
    time_range: Optional[int] = Field(
        default=7, description="Time range in days, defaults to 7"
    )
@ -21,7 +21,7 @@ class DeviceSummaryRequest(BaseModel):

 # Define the response model
 class DeviceStatistics(BaseModel):
-    device_id: int
+    device_id: str
    time_range: str
    data: str

@ -33,7 +33,7 @@ class DeviceSummaryResponse(BaseModel):


 class DeviceRebootRequest(BaseModel):
-    device_ids: List[int]
+    device_id: str


 # Response model for the device reboot
@ -49,24 +49,21 @@ def reboot_network_device(request_data: DeviceRebootRequest):
    """

    # Access data from the Pydantic model
-    device_ids = request_data.device_ids
+    device_id = request_data.device_id

-    # Validate 'device_ids'
+    # Validate 'device_id'
    # (This is already validated by Pydantic, but additional logic can be added if needed)
-    if not device_ids:
-        raise HTTPException(
-            status_code=400, detail="'device_ids' parameter is required"
-        )
+    if not device_id:
+        raise HTTPException(status_code=400, detail="'device_id' parameter is required")

    # Simulate reboot operation and return the response
    statistics = []
-    for device_id in device_ids:
-        # Placeholder for actual data retrieval or device reboot logic
-        stats = {"data": f"Device {device_id} has been successfully rebooted."}
-        statistics.append(stats)
+    # Placeholder for actual data retrieval or device reboot logic
+    stats = {"data": f"Device {device_id} has been successfully rebooted."}
+    statistics.append(stats)

    # Return the response with a summary
-    return CoverageResponse(status="success", summary={"device_ids": device_ids})
+    return CoverageResponse(status="success", summary={"device_id": device_id})


 # Post method for device summary
@ -76,28 +73,20 @@ def get_device_summary(request: DeviceSummaryRequest):
    Endpoint to retrieve device statistics based on device IDs and an optional time range.
    """

-    # Extract 'device_ids' and 'time_range' from the request
-    device_ids = request.device_ids
+    # Extract 'device_id' and 'time_range' from the request
+    device_id = request.device_id
    time_range = request.time_range

    # Simulate retrieving statistics for the given device IDs and time range
    statistics = []
-    minutes = 1
-    for device_id in device_ids:
-        stats = {
-            "device_id": device_id,
-            "time_range": f"Last {time_range} days",
-            "data": f"""Device {device_id} over the last {time_range} days experienced {minutes}
-             minutes of downtime.""",
-        }
-        minutes += 1
-        statistics.append(DeviceStatistics(**stats))
+    minutes = 4
+    stats = {
+        "device_id": device_id,
+        "time_range": f"Last {time_range} days",
+        "data": f"""Device {device_id} over the last {time_range} days experienced {minutes}
+        minutes of downtime.""",
+    }
+
+    statistics.append(DeviceStatistics(**stats))

    return DeviceSummaryResponse(statistics=statistics)
-
-
-CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
-client = OpenAI(
-    api_key="--",
-    base_url=CHAT_COMPLETION_ENDPOINT,
-)
--- a/demos/samples_python/network_switch_operator_agent/run_demo.sh
+++ b/demos/samples_python/network_switch_operator_agent/run_demo.sh
@ -22,9 +22,8 @@ start_demo() {
  echo "Starting Arch with arch_config.yaml..."
  archgw up arch_config.yaml

-  # Step 4: Start Network Agent
+  # Step 4: Start developer services
  echo "Starting Network Agent using Docker Compose..."
-  cd build
  docker compose up -d  # Run in detached mode
 }

--- a/demos/samples_python/stock_quote/arch_config.yaml
+++ b/demos/samples_python/stock_quote/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o
--- a/demos/samples_python/weather_forecast/README.md
+++ b/demos/samples_python/weather_forecast/README.md
@ -1,6 +1,6 @@
 # Function calling

-This demo shows how you can use Arch's core function calling capabilites.
+This demo shows how you can use Arch's core function calling capabilities.

 # Starting the demo

--- a/demos/samples_python/weather_forecast/arch_config.yaml
+++ b/demos/samples_python/weather_forecast/arch_config.yaml
@ -1,10 +1,11 @@
 version: "0.1-beta"

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 endpoints:
  weather_forecast_service:
@ -16,21 +17,17 @@ overrides:
  prompt_target_intent_matching_threshold: 0.6

 llm_providers:
-  - name: gpt-4o-mini
-    access_key: $OPENAI_API_KEY
+  - name: groq
+    access_key: $GROQ_API_KEY
    provider_interface: openai
-    model: gpt-4o-mini
-    default: true
-
-  - name: gpt-3.5-turbo-0125
-    access_key: $OPENAI_API_KEY
-    provider_interface: openai
-    model: gpt-3.5-turbo-0125
+    model: llama-3.2-3b-preview
+    base_url: https://api.groq.com

  - name: gpt-4o
    access_key: $OPENAI_API_KEY
    provider_interface: openai
    model: gpt-4o
+    default: true

 system_prompt: |
  You are a helpful assistant.
--- a/demos/samples_python/weather_forecast/docker-compose.yaml
+++ b/demos/samples_python/weather_forecast/docker-compose.yaml
@ -19,3 +19,5 @@ services:
      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
    extra_hosts:
      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./arch_config.yaml:/app/arch_config.yaml
--- a/demos/samples_python/weather_forecast/hurl_tests/simple.hurl
+++ b/demos/samples_python/weather_forecast/hurl_tests/simple.hurl
@ -0,0 +1,19 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "how is the weather in seattle for next 5 days"
+    }
+  ]
+}
+HTTP 200
+[Asserts]
+header "content-type" == "application/json"
+jsonpath "$.model" matches /^gpt-4o/
+jsonpath "$.metadata.x-arch-state" != null
+jsonpath "$.usage" != null
+jsonpath "$.choices[0].message.content" matches /Seattle/
+jsonpath "$.choices[0].message.role" == "assistant"
--- a/demos/samples_python/weather_forecast/hurl_tests/simple_stream.hurl
+++ b/demos/samples_python/weather_forecast/hurl_tests/simple_stream.hurl
@ -0,0 +1,17 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "how is the weather in seattle for next 5 days"
+    }
+  ],
+  "stream": true
+}
+HTTP 200
+[Asserts]
+header "content-type" matches /text\/event-stream/
+body matches "(?s).*\"name\":\"get_current_weather\".*"
+body matches "(?s).*\"model\":\"gpt-4o.*"
--- a/demos/samples_python/weather_forecast/main.py
+++ b/demos/samples_python/weather_forecast/main.py
@ -73,7 +73,7 @@ async def weather(req: WeatherRequest, res: Response):


 class DefaultTargetRequest(BaseModel):
-    messages: list
+    messages: list = []


@app.post("/default_target")
@ -86,12 +86,9 @@ async def default_target(req: DefaultTargetRequest, res: Response):
                    "role": "assistant",
                    "content": "I can help you with weather forecast",
                },
-                "finish_reason": "completed",
-                "index": 0,
            }
        ],
        "model": "api_server",
-        "usage": {"completion_tokens": 0},
    }
    logger.info(f"sending response: {json.dumps(resp)}")
    return resp
--- a/demos/shared/chatbot_ui/.vscode/launch.json
+++ b/demos/shared/chatbot_ui/.vscode/launch.json
@ -15,7 +15,7 @@
        "LLM": "1",
        "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
        "STREAMING": "True",
-        "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
+        "ARCH_CONFIG": "../../samples_python/weather_forecast/arch_config.yaml"
      }
    },
    {
@ -29,7 +29,7 @@
        "LLM": "1",
        "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
        "STREAMING": "True",
-        "ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
+        "ARCH_CONFIG": "../../use_cases/llm_routing/arch_config.yaml"
      }
    },
  ]
--- a/demos/shared/chatbot_ui/common.py
+++ b/demos/shared/chatbot_ui/common.py
@ -38,7 +38,7 @@ def chat(
    try:
        response = client.chat.completions.create(
            # we select model from arch_config file
-            model="--",
+            model="None",
            messages=history,
            temperature=1.0,
            stream=True,
@ -120,8 +120,11 @@ def process_stream_chunk(chunk, history):

    if delta.content:
        # append content to the last history item
-        history[-1]["content"] = history[-1].get("content", "") + delta.content
+        if history[-1]["model"] != "Arch-Function-Chat":
+            history[-1]["content"] = history[-1].get("content", "") + delta.content
        # yield content if it is from assistant
+        if history[-1]["model"] == "Arch-Function":
+            return None
        if history[-1]["role"] == "assistant":
            return delta.content

--- a/demos/shared/chatbot_ui/run_stream.py
+++ b/demos/shared/chatbot_ui/run_stream.py
@ -54,13 +54,13 @@ def chat(
        if model_selector and model_selector != "":
            headers["x-arch-llm-provider-hint"] = model_selector
        client = OpenAI(
-            api_key="--",
+            api_key="None",
            base_url=CHAT_COMPLETION_ENDPOINT,
            default_headers=headers,
        )
        response = client.chat.completions.create(
            # we select model from arch_config file
-            model="--",
+            model="None",
            messages=history,
            temperature=1.0,
            stream=True,
@ -88,6 +88,22 @@ def chat(

            yield "", conversation, history, debug_output, model_selector

+    # update assistant response to have correct format
+    # arch-fc 1.1 expects following format:
+    # {
+    #     "response": "<assistant response>",
+    # }
+    # and this entire block needs to be encoded in ```json\n{json_encoded_content}\n```
+
+    if not history[-1]["model"].startswith("Arch"):
+        assistant_response = {
+            "response": history[-1]["content"],
+        }
+        history[-1]["content"] = "```json\n{}\n```".format(
+            json.dumps(assistant_response)
+        )
+    log.info("history: {}".format(json.dumps(history)))
+

 def main():
    with gr.Blocks(
--- a/demos/shared/test_runner/run_demo_tests.sh
+++ b/demos/shared/test_runner/run_demo_tests.sh
@ -8,11 +8,13 @@ do
  echo "Running tests for $demo ..."
  echo "****************************************"
  cd ../../samples_python/$demo
+  echo "starting archgw"
  archgw up arch_config.yaml
-  docker compose up -d
-  cd ../../shared/test_runner
-  TEST_DATA=../../samples_python/$demo/test_data.yaml poetry run pytest
-  cd ../../samples_python/$demo
+  echo "starting docker containers"
+  docker compose up -d 2>&1 > /dev/null
+  echo "starting hurl tests"
+  hurl --test hurl_tests/*.hurl
+  echo "stopping docker containers and archgw"
  archgw down
  docker compose down -v
  cd ../../shared/test_runner
--- a/demos/use_cases/llm_routing/arch_config.yaml
+++ b/demos/use_cases/llm_routing/arch_config.yaml
@ -1,10 +1,11 @@
 version: "0.1-beta"

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o-mini
@ -13,11 +14,6 @@ llm_providers:
    model: gpt-4o-mini
    default: true

-  - name: gpt-3.5-turbo-0125
-    access_key: $OPENAI_API_KEY
-    provider_interface: openai
-    model: gpt-3.5-turbo-0125
-
  - name: gpt-4o
    access_key: $OPENAI_API_KEY
    provider_interface: openai
@ -28,5 +24,17 @@ llm_providers:
    provider_interface: mistral
    model: ministral-3b-latest

+  - name: deepseek
+    access_key: $DEEPSEEK_API_KEY
+    provider_interface: openai
+    model: deepseek-reasoner
+    base_url: https://api.deepseek.com/
+
+  - name: groq
+    access_key: $GROQ_API_KEY
+    provider_interface: openai
+    model: llama-3.1-8b-instant
+    base_url: https://api.groq.com
+
 tracing:
  random_sampling: 100
--- a/demos/use_cases/ollama/arch_config.yaml
+++ b/demos/use_cases/ollama/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s

 llm_providers:

--- a/demos/use_cases/ollama/docker-compose.yaml
+++ b/demos/use_cases/ollama/docker-compose.yaml
@ -6,7 +6,7 @@ services:
      - "18080:8080"
    environment:
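-      # this is only because we are running the sample app in the same docker container environemtn as archgw
+      # this is only because we are running the sample app in the same docker container environment as archgw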
-      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
+      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:12000/v1
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
--- a/demos/use_cases/orchestrating_agents/Dockerfile
+++ b/demos/use_cases/orchestrating_agents/Dockerfile
@ -0,0 +1,41 @@
+# took inspiration from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
+
+# The builder image, used to build the virtual environment
+FROM python:3.10 as builder
+
+# Same Poetry version that generated poetry.lock (see the lock file header)
+RUN pip install poetry==1.8.3
+
+# VIRTUALENVS_IN_PROJECT keeps the venv under the project directory (/code/.venv),
+# which is the path the runtime stage copies from this stage
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+WORKDIR /code
+
+COPY pyproject.toml poetry.lock ./
+# NOTE(review): presumably pyproject.toml references README.md and Poetry needs the file to exist - confirm
+RUN touch README.md
+
+# Install dependencies only (--no-root skips the project itself), then remove the Poetry cache directory
+RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR
+
+# The runtime image: slim base plus the virtual environment built in the builder stage
+FROM python:3.10-slim AS runtime
+
+# curl is only needed by the HEALTHCHECK below; drop the apt lists to keep the layer small
+RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /code
+
+ENV VIRTUAL_ENV=/code/.venv \
+    PATH="/code/.venv/bin:$PATH"
+
+COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+
+COPY main.py ./
+
+# -f makes curl exit non-zero on HTTP >= 400; without it an erroring app still reports healthy.
+# "|| exit 1" maps any curl failure code to the exit status Docker treats as unhealthy.
+HEALTHCHECK \
+    --interval=5s \
+    --timeout=1s \
+    --start-period=1s \
+    --retries=3 \
+    CMD curl -fsS http://localhost:80/healthz || exit 1
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--log-level", "debug"]
--- a/demos/use_cases/orchestrating_agents/arch_config.yaml
+++ b/demos/use_cases/orchestrating_agents/arch_config.yaml
@ -0,0 +1,46 @@
+version: "0.1-beta"
+
+listeners:
+  # Port 10000 is where the chatbot UI (docker-compose.yaml) and the hurl tests send chat completions
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s
+
+  # Port 12000 is the base_url that main.py's OpenAI client uses for its LLM calls
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s
+
+overrides:
+  use_agent_orchestrator: true
+
+endpoints:
+  # 18083 is the host port that docker-compose.yaml maps to the triage_service container (port 80)
+  agent_gateway:
+    endpoint: host.docker.internal:18083
+    connect_timeout: 0.005s
+
+# NOTE(review): main.py's issues_and_repairs agent requests model "gpt-4o", which is not
+# listed below - confirm the gateway can serve a model that has no provider entry here
+llm_providers:
+  - name: gpt-4o-mini
+    access_key: $OPENAI_API_KEY
+    provider_interface: openai
+    model: gpt-4o-mini
+    default: true
+
+system_prompt: |
+  You are a helpful assistant.
+
+# Target names match the keys of the AGENTS dict in main.py
+prompt_targets:
+  - name: sales_agent
+    description: handles queries related to sales and purchases
+
+  - name: issues_and_repairs
+    description: handles issues, repairs, or refunds
+
+  - name: escalate_to_human
+    description: escalates to human agent
+
+tracing:
+  random_sampling: 100
+  trace_arch_internal: true
--- a/demos/use_cases/orchestrating_agents/docker-compose.yaml
+++ b/demos/use_cases/orchestrating_agents/docker-compose.yaml
@ -0,0 +1,29 @@
+services:
+  triage_service:
+    build:
+      context: ./
+    environment:
+      - OLTP_HOST=http://jaeger:4317
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    ports:
+      - "18083:80"
+
+  chatbot_ui:
+    build:
+      context: ../../shared/chatbot_ui
+    ports:
+      - "18080:8080"
+    environment:
+      # this is only because we are running the sample app in the same docker container environment as archgw
+      - CHAT_COMPLETION_ENDPOINT=http://host.docker.internal:10000/v1
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+
+  jaeger:
+    build:
+      context: ../../shared/jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
--- a/demos/use_cases/orchestrating_agents/hurl_tests/simple_issues_repairs.hurl
+++ b/demos/use_cases/orchestrating_agents/hurl_tests/simple_issues_repairs.hurl
@ -0,0 +1,19 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "I bought a package recently and it not working properly"
+    }
+  ]
+}
+HTTP 200
+[Asserts]
+header "content-type" == "application/json"
+jsonpath "$.model" matches /^gpt-4o-2/
+jsonpath "$.metadata.x-arch-state" != null
+jsonpath "$.usage" != null
+jsonpath "$.choices[0].message.content" != null
+jsonpath "$.choices[0].message.role" == "assistant"
--- a/demos/use_cases/orchestrating_agents/hurl_tests/simple_sale_agent.hurl
+++ b/demos/use_cases/orchestrating_agents/hurl_tests/simple_sale_agent.hurl
@ -0,0 +1,19 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "I want to sell red shoes"
+    }
+  ]
+}
+HTTP 200
+[Asserts]
+header "content-type" == "application/json"
+jsonpath "$.model" matches /^gpt-4o-mini/
+jsonpath "$.metadata.x-arch-state" != null
+jsonpath "$.usage" != null
+jsonpath "$.choices[0].message.content" != null
+jsonpath "$.choices[0].message.role" == "assistant"
--- a/demos/use_cases/orchestrating_agents/hurl_tests/simple_stream.hurl
+++ b/demos/use_cases/orchestrating_agents/hurl_tests/simple_stream.hurl
@ -0,0 +1,16 @@
+POST http://localhost:10000/v1/chat/completions
+Content-Type: application/json
+
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": "I want to sell red shoes"
+    }
+  ],
+  "stream": true
+}
+HTTP 200
+[Asserts]
+header "content-type" matches /text\/event-stream/
+body matches /^data: .*?sales_agent.*?\n/
--- a/demos/use_cases/orchestrating_agents/main.py
+++ b/demos/use_cases/orchestrating_agents/main.py
@ -0,0 +1,115 @@
+import logging
+import json
+from typing import List, Dict, Any
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import openai
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("uvicorn.error")
+
+app = FastAPI()
+
+
+class Message(BaseModel):
+    """One chat message of a request; dumped to a dict and forwarded to the LLM by Agent.handle."""
+
+    role: str
+    content: str
+
+
+class ChatCompletionsRequest(BaseModel):
+    """Request body accepted by POST /v1/chat/completions."""
+
+    messages: List[Message]
+    # Used by Agent.handle only when the selected agent has no model of its own
+    model: str
+    # completion_api reads the "agent-name" key from here to pick the agent
+    metadata: Dict[str, Any] = {}
+    # When true, call_openai answers with a text/event-stream response
+    stream: bool = False
+
+
+# Shared client for all agents. The base_url targets port 12000 on the Docker host
+# (the egress_traffic listener port in arch_config.yaml), not a provider URL.
+openai_client = openai.OpenAI(
+    api_key="None",  # archgw picks the API key from the config file
+    base_url="http://host.docker.internal:12000/v1",
+)
+
+
+def call_openai(messages: List[Dict[str, str]], stream: bool, model: str):
+    """Send a chat completion request through `openai_client`.
+
+    Returns the completion object unchanged when `stream` is false; otherwise
+    returns a StreamingResponse that re-emits each chunk as a server-sent event.
+    """
+    logger.info(f"llm agent model: {model}")
+    completion = openai_client.chat.completions.create(
+        model=model,
+        messages=messages,
+        stream=stream,
+    )
+
+    if not stream:
+        return completion
+
+    def sse_events():
+        # Forward only chunks whose first choice carries a delta.
+        for chunk in completion:
+            if chunk.choices and chunk.choices[0].delta:
+                yield "data: " + json.dumps(chunk.model_dump()) + "\n\n"
+        # Terminator event, emitted once the upstream iterator is exhausted.
+        yield "data: [DONE]" + "\n\n"
+
+    return StreamingResponse(sse_events(), media_type="text/event-stream")
+
+
+class Agent:
+    """An agent persona: a fixed system prompt plus an optional model override."""
+
+    def __init__(self, role: str, instructions: str, model: str = ""):
+        self.system_prompt = f"You are a {role}.\n{instructions}"
+        self.model = model
+
+    def get_system_prompt(self) -> str:
+        return self.system_prompt
+
+    def handle(self, req: ChatCompletionsRequest):
+        """Prepend this agent's system prompt to the request messages and call the LLM."""
+        conversation = [{"role": "system", "content": self.get_system_prompt()}]
+        conversation.extend(message.model_dump() for message in req.messages)
+
+        # An empty self.model means "use the model named in the request".
+        selected_model = self.model or req.model
+        return call_openai(conversation, req.stream, selected_model)
+
+
+# Define your agents
+# Keys are the names completion_api looks up from the request metadata. The first three
+# match the prompt_targets in arch_config.yaml; "unknown_agent" is the default used when
+# the metadata carries no "agent-name".
+AGENTS = {
+    "sales_agent": Agent(
+        role="sales agent",
+        instructions=(
+            "Always answer in a sentence or less.\n"
+            "Follow the following routine with the user:\n"
+            "1. Engage\n"
+            "2. Quote ridiculous price\n"
+            "3. Reveal caveat if user agrees."
+        ),
+        model="gpt-4o-mini",
+    ),
+    "issues_and_repairs": Agent(
+        role="issues and repairs agent",
+        instructions="Propose a solution, offer refund if necessary.",
+        model="gpt-4o",
+    ),
+    "escalate_to_human": Agent(
+        role="human escalation agent",
+        instructions="Escalate issues to a human.",
+        # skipping model name here as arch gateway will pick the default model from the config file
+    ),
+    # No model override here either, so Agent.handle uses the model named in the request
+    "unknown_agent": Agent(
+        role="general assistant", instructions="Assist the user in general queries."
+    ),
+}
+
+
+@app.post("/v1/chat/completions")
+def completion_api(req: ChatCompletionsRequest, request: Request):
+    """Route the request to the agent named in `metadata["agent-name"]`.
+
+    A missing name, or a name that is not a key of AGENTS, is served by
+    "unknown_agent" instead of failing with an AttributeError on None.
+    """
+    agent_name = req.metadata.get("agent-name", "unknown_agent")
+    agent = AGENTS.get(agent_name)
+    if agent is None:
+        logger.warning(f"Unknown agent '{agent_name}', falling back to unknown_agent")
+        agent_name = "unknown_agent"
+        agent = AGENTS[agent_name]
+    logger.info(f"Routing to agent: {agent_name}")
+
+    return agent.handle(req)
+
+
+@app.get("/healthz")
+async def healthz():
+    """Health endpoint; the Dockerfile HEALTHCHECK requests this path."""
+    status_payload = {"status": "ok"}
+    return status_payload
--- a/demos/use_cases/orchestrating_agents/poetry.lock
+++ b/demos/use_cases/orchestrating_agents/poetry.lock
@ -0,0 +1,573 @@
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"},
+    {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"},
+]
+
+[[package]]
+name = "anyio"
+version = "4.9.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c"},
+    {file = "anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""}
+
+[package.extras]
+doc = ["Sphinx (>=8.2,<9.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"]
+test = ["anyio[trio]", "blockbuster (>=1.5.23)", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"]
+trio = ["trio (>=0.26.1)"]
+
+[[package]]
+name = "certifi"
+version = "2025.1.31"
+description = "Python package for providing Mozilla's CA Bundle."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"},
+    {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"},
+]
+
+[[package]]
+name = "click"
+version = "8.1.8"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
+    {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+description = "Cross-platform colored terminal text."
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+files = [
+    {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
+    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
+]
+
+[[package]]
+name = "distro"
+version = "1.9.0"
+description = "Distro - an OS platform information API"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
+    {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
+]
+
+[[package]]
+name = "exceptiongroup"
+version = "1.2.2"
+description = "Backport of PEP 654 (exception groups)"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
+    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
+]
+
+[package.extras]
+test = ["pytest (>=6)"]
+
+[[package]]
+name = "fastapi"
+version = "0.115.11"
+description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"},
+    {file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"},
+]
+
+[package.dependencies]
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
+starlette = ">=0.40.0,<0.47.0"
+typing-extensions = ">=4.8.0"
+
+[package.extras]
+all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=3.1.5)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.18)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
+standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=3.1.5)", "python-multipart (>=0.0.18)", "uvicorn[standard] (>=0.12.0)"]
+
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.7"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"},
+    {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<1.0)"]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"},
+    {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+httpcore = "==1.*"
+idna = "*"
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+zstd = ["zstandard (>=0.18.0)"]
+
+[[package]]
+name = "idna"
+version = "3.10"
+description = "Internationalized Domain Names in Applications (IDNA)"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"},
+    {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"},
+]
+
+[package.extras]
+all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
+
+[[package]]
+name = "jiter"
+version = "0.9.0"
+description = "Fast iterable JSON parser."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "jiter-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:816ec9b60fdfd1fec87da1d7ed46c66c44ffec37ab2ef7de5b147b2fce3fd5ad"},
+    {file = "jiter-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b1d3086f8a3ee0194ecf2008cf81286a5c3e540d977fa038ff23576c023c0ea"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1339f839b91ae30b37c409bf16ccd3dc453e8b8c3ed4bd1d6a567193651a4a51"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ffba79584b3b670fefae66ceb3a28822365d25b7bf811e030609a3d5b876f538"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cfc7d0a8e899089d11f065e289cb5b2daf3d82fbe028f49b20d7b809193958d"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e00a1a2bbfaaf237e13c3d1592356eab3e9015d7efd59359ac8b51eb56390a12"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1d9870561eb26b11448854dce0ff27a9a27cb616b632468cafc938de25e9e51"},
+    {file = "jiter-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9872aeff3f21e437651df378cb75aeb7043e5297261222b6441a620218b58708"},
+    {file = "jiter-0.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fd19112d1049bdd47f17bfbb44a2c0001061312dcf0e72765bfa8abd4aa30e5"},
+    {file = "jiter-0.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6ef5da104664e526836070e4a23b5f68dec1cc673b60bf1edb1bfbe8a55d0678"},
+    {file = "jiter-0.9.0-cp310-cp310-win32.whl", hash = "sha256:cb12e6d65ebbefe5518de819f3eda53b73187b7089040b2d17f5b39001ff31c4"},
+    {file = "jiter-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:c43ca669493626d8672be3b645dbb406ef25af3f4b6384cfd306da7eb2e70322"},
+    {file = "jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af"},
+    {file = "jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15"},
+    {file = "jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419"},
+    {file = "jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043"},
+    {file = "jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965"},
+    {file = "jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2"},
+    {file = "jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd"},
+    {file = "jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11"},
+    {file = "jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc"},
+    {file = "jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc"},
+    {file = "jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e"},
+    {file = "jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d"},
+    {file = "jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06"},
+    {file = "jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0"},
+    {file = "jiter-0.9.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:2764891d3f3e8b18dce2cff24949153ee30c9239da7c00f032511091ba688ff7"},
+    {file = "jiter-0.9.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:387b22fbfd7a62418d5212b4638026d01723761c75c1c8232a8b8c37c2f1003b"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d8da8629ccae3606c61d9184970423655fb4e33d03330bcdfe52d234d32f69"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1be73d8982bdc278b7b9377426a4b44ceb5c7952073dd7488e4ae96b88e1103"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2228eaaaa111ec54b9e89f7481bffb3972e9059301a878d085b2b449fbbde635"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:11509bfecbc319459647d4ac3fd391d26fdf530dad00c13c4dadabf5b81f01a4"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f22238da568be8bbd8e0650e12feeb2cfea15eda4f9fc271d3b362a4fa0604d"},
+    {file = "jiter-0.9.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17f5d55eb856597607562257c8e36c42bc87f16bef52ef7129b7da11afc779f3"},
+    {file = "jiter-0.9.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:6a99bed9fbb02f5bed416d137944419a69aa4c423e44189bc49718859ea83bc5"},
+    {file = "jiter-0.9.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e057adb0cd1bd39606100be0eafe742de2de88c79df632955b9ab53a086b3c8d"},
+    {file = "jiter-0.9.0-cp313-cp313-win32.whl", hash = "sha256:f7e6850991f3940f62d387ccfa54d1a92bd4bb9f89690b53aea36b4364bcab53"},
+    {file = "jiter-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:c8ae3bf27cd1ac5e6e8b7a27487bf3ab5f82318211ec2e1346a5b058756361f7"},
+    {file = "jiter-0.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f0b2827fb88dda2cbecbbc3e596ef08d69bda06c6f57930aec8e79505dc17001"},
+    {file = "jiter-0.9.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:062b756ceb1d40b0b28f326cba26cfd575a4918415b036464a52f08632731e5a"},
+    {file = "jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf"},
+    {file = "jiter-0.9.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4a2d16360d0642cd68236f931b85fe50288834c383492e4279d9f1792e309571"},
+    {file = "jiter-0.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e84ed1c9c9ec10bbb8c37f450077cbe3c0d4e8c2b19f0a49a60ac7ace73c7452"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f3c848209ccd1bfa344a1240763975ca917de753c7875c77ec3034f4151d06c"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7825f46e50646bee937e0f849d14ef3a417910966136f59cd1eb848b8b5bb3e4"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d82a811928b26d1a6311a886b2566f68ccf2b23cf3bfed042e18686f1f22c2d7"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c058ecb51763a67f019ae423b1cbe3fa90f7ee6280c31a1baa6ccc0c0e2d06e"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9897115ad716c48f0120c1f0c4efae348ec47037319a6c63b2d7838bb53aaef4"},
+    {file = "jiter-0.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:351f4c90a24c4fb8c87c6a73af2944c440494ed2bea2094feecacb75c50398ae"},
+    {file = "jiter-0.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d45807b0f236c485e1e525e2ce3a854807dfe28ccf0d013dd4a563395e28008a"},
+    {file = "jiter-0.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1537a890724ba00fdba21787010ac6f24dad47f763410e9e1093277913592784"},
+    {file = "jiter-0.9.0-cp38-cp38-win32.whl", hash = "sha256:e3630ec20cbeaddd4b65513fa3857e1b7c4190d4481ef07fb63d0fad59033321"},
+    {file = "jiter-0.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:2685f44bf80e95f8910553bf2d33b9c87bf25fceae6e9f0c1355f75d2922b0ee"},
+    {file = "jiter-0.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:9ef340fae98065071ccd5805fe81c99c8f80484e820e40043689cf97fb66b3e2"},
+    {file = "jiter-0.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:efb767d92c63b2cd9ec9f24feeb48f49574a713870ec87e9ba0c2c6e9329c3e2"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:113f30f87fb1f412510c6d7ed13e91422cfd329436364a690c34c8b8bd880c42"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8793b6df019b988526f5a633fdc7456ea75e4a79bd8396a3373c371fc59f5c9b"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7a9aaa5102dba4e079bb728076fadd5a2dca94c05c04ce68004cfd96f128ea34"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d838650f6ebaf4ccadfb04522463e74a4c378d7e667e0eb1865cfe3990bfac49"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0194f813efdf4b8865ad5f5c5f50f8566df7d770a82c51ef593d09e0b347020"},
+    {file = "jiter-0.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a7954a401d0a8a0b8bc669199db78af435aae1e3569187c2939c477c53cb6a0a"},
+    {file = "jiter-0.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4feafe787eb8a8d98168ab15637ca2577f6ddf77ac6c8c66242c2d028aa5420e"},
+    {file = "jiter-0.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:27cd1f2e8bb377f31d3190b34e4328d280325ad7ef55c6ac9abde72f79e84d2e"},
+    {file = "jiter-0.9.0-cp39-cp39-win32.whl", hash = "sha256:161d461dcbe658cf0bd0aa375b30a968b087cdddc624fc585f3867c63c6eca95"},
+    {file = "jiter-0.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8b36d8a16a61993be33e75126ad3d8aa29cf450b09576f3c427d27647fcb4aa"},
+    {file = "jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893"},
+]
+
+[[package]]
+name = "openai"
+version = "1.66.5"
+description = "The official Python library for the openai API"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "openai-1.66.5-py3-none-any.whl", hash = "sha256:74be528175f8389f67675830c51a15bd51e874425c86d3de6153bf70ed6c2884"},
+    {file = "openai-1.66.5.tar.gz", hash = "sha256:f61b8fac29490ca8fdc6d996aa6926c18dbe5639536f8c40219c40db05511b11"},
+]
+
+[package.dependencies]
+anyio = ">=3.5.0,<5"
+distro = ">=1.7.0,<2"
+httpx = ">=0.23.0,<1"
+jiter = ">=0.4.0,<1"
+pydantic = ">=1.9.0,<3"
+sniffio = "*"
+tqdm = ">4"
+typing-extensions = ">=4.11,<5"
+
+[package.extras]
+datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
+realtime = ["websockets (>=13,<15)"]
+
+[[package]]
+name = "pydantic"
+version = "2.10.6"
+description = "Data validation using Python type hints"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"},
+    {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"},
+]
+
+[package.dependencies]
+annotated-types = ">=0.6.0"
+pydantic-core = "2.27.2"
+typing-extensions = ">=4.12.2"
+
+[package.extras]
+email = ["email-validator (>=2.0.0)"]
+timezone = ["tzdata"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.27.2"
+description = "Core functionality for Pydantic validation and serialization"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4"},
+    {file = "pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc"},
+    {file = "pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9"},
+    {file = "pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee"},
+    {file = "pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d3e8d504bdd3f10835468f29008d72fc8359d95c9c415ce6e767203db6127506"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521eb9b7f036c9b6187f0b47318ab0d7ca14bd87f776240b90b21c1f4f149320"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85210c4d99a0114f5a9481b44560d7d1e35e32cc5634c656bc48e590b669b145"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d716e2e30c6f140d7560ef1538953a5cd1a87264c737643d481f2779fc247fe1"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f66d89ba397d92f840f8654756196d93804278457b5fbede59598a1f9f90b228"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:669e193c1c576a58f132e3158f9dfa9662969edb1a250c54d8fa52590045f046"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdbe7629b996647b99c01b37f11170a57ae675375b14b8c13b8518b8320ced5"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d262606bf386a5ba0b0af3b97f37c83d7011439e3dc1a9298f21efb292e42f1a"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cabb9bcb7e0d97f74df8646f34fc76fbf793b7f6dc2438517d7a9e50eee4f14d"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:d2d63f1215638d28221f664596b1ccb3944f6e25dd18cd3b86b0a4c408d5ebb9"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bca101c00bff0adb45a833f8451b9105d9df18accb8743b08107d7ada14bd7da"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-win32.whl", hash = "sha256:f6f8e111843bbb0dee4cb6594cdc73e79b3329b526037ec242a3e49012495b3b"},
+    {file = "pydantic_core-2.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:fd1aea04935a508f62e0d0ef1f5ae968774a32afc306fb8545e06f5ff5cdf3ad"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e"},
+    {file = "pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9"},
+    {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2"},
+    {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35"},
+    {file = "pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
+
+[[package]]
+name = "pyyaml"
+version = "6.0.2"
+description = "YAML parser and emitter for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
+    {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
+    {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
+    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
+    {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
+    {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
+    {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
+    {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
+    {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
+    {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
+    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
+    {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
+    {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
+    {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
+    {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
+    {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
+    {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
+    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
+    {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
+    {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
+    {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
+    {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
+    {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
+    {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
+    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
+    {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
+    {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
+    {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
+    {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"},
+    {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"},
+    {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"},
+    {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"},
+    {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"},
+    {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"},
+    {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"},
+    {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"},
+    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"},
+    {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"},
+    {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"},
+    {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"},
+    {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
+]
+
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
+    {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
+]
+
+[[package]]
+name = "starlette"
+version = "0.46.1"
+description = "The little ASGI library that shines."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "starlette-0.46.1-py3-none-any.whl", hash = "sha256:77c74ed9d2720138b25875133f3a2dae6d854af2ec37dceb56aef370c1d8a227"},
+    {file = "starlette-0.46.1.tar.gz", hash = "sha256:3c88d58ee4bd1bb807c0d1acb381838afc7752f9ddaec81bbe4383611d833230"},
+]
+
+[package.dependencies]
+anyio = ">=3.6.2,<5"
+
+[package.extras]
+full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"]
+
+[[package]]
+name = "tqdm"
+version = "4.67.1"
+description = "Fast, Extensible Progress Meter"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"},
+    {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["nbval", "pytest (>=6)", "pytest-asyncio (>=0.24)", "pytest-cov", "pytest-timeout"]
+discord = ["requests"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
+[[package]]
+name = "typing-extensions"
+version = "4.12.2"
+description = "Backported and Experimental Type Hints for Python 3.8+"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
+    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
+]
+
+[[package]]
+name = "uvicorn"
+version = "0.34.0"
+description = "The lightning-fast ASGI server."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "uvicorn-0.34.0-py3-none-any.whl", hash = "sha256:023dc038422502fa28a09c7a30bf2b6991512da7dcdb8fd35fe57cfc154126f4"},
+    {file = "uvicorn-0.34.0.tar.gz", hash = "sha256:404051050cd7e905de2c9a7e61790943440b3416f49cb409f965d9dcd0fa73e9"},
+]
+
+[package.dependencies]
+click = ">=7.0"
+h11 = ">=0.8"
+typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
+
+[package.extras]
+standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
+
+[metadata]
+lock-version = "2.0"
+python-versions = "^3.10"
+content-hash = "d005d82268b6f8c2a68b26c454bced5c34bf3c971c0cbfefde3fc0c45c675f55"
--- a/demos/use_cases/orchestrating_agents/pyproject.toml
+++ b/demos/use_cases/orchestrating_agents/pyproject.toml
@ -0,0 +1,20 @@
+[tool.poetry]
+name = "api-server"
+version = "0.1.0"
+description = ""
+authors = ["Adil Hafeez <info@katanemo.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+fastapi = "^0.115.4"
+pyyaml = "^6.0.2"
+uvicorn = "^0.34.0"
+openai = "^1.66.5"
+
+[build-system]
+requires = ["poetry-core"]  # no version bound: any poetry-core release satisfies this
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+api-server = "api_server.main:app"  # NOTE(review): console scripts invoke the target with no arguments; confirm `app` in api_server/main.py is callable that way
--- a/demos/use_cases/orchestrating_agents/run_demo.sh
+++ b/demos/use_cases/orchestrating_agents/run_demo.sh
@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+
+# Bring the demo up: make sure a .env file exists, then start Arch and the
+# Docker Compose services.
+start_demo() {
+  # A .env file is only generated when none is present yet.
+  if [[ ! -f ".env" ]]; then
+    # The generated file contains nothing but the key, so the key must be set.
+    if [[ -z "${OPENAI_API_KEY}" ]]; then
+      echo "Error: OPENAI_API_KEY environment variable is not set for the demo."
+      exit 1
+    fi
+
+    echo "Creating .env file..."
+    echo "OPENAI_API_KEY=$OPENAI_API_KEY" > .env
+    echo ".env file created with OPENAI_API_KEY."
+  else
+    echo ".env file already exists. Skipping creation."
+  fi
+
+  # Start Arch from the configuration file in the current directory.
+  echo "Starting Arch with arch_config.yaml..."
+  archgw up arch_config.yaml
+
+  # Start the Compose-managed services in the background (detached mode).
+  echo "Starting Network Agent using Docker Compose..."
+  docker compose up -d
+}
+
+# Function to stop the demo.
+#
+# Both teardown steps are always attempted. The script runs under `set -e`, so
+# without the `||` guards a failing `docker compose down` would abort the
+# script before `archgw down` ran and leave Arch running.
+#
+# Returns 0 when both steps succeed; otherwise the exit status of the last
+# step that failed, so the failure is still reported to the caller.
+stop_demo() {
+  local status=0
+
+  # Step 1: Stop Docker Compose services
+  echo "Stopping Network Agent using Docker Compose..."
+  docker compose down || status=$?
+
+  # Step 2: Stop Arch (attempted even if step 1 failed)
+  echo "Stopping Arch..."
+  archgw down || status=$?
+
+  return "$status"
+}
+
+# Main script logic
+if [ "$1" == "down" ]; then
+  stop_demo
+else
+  # Default action is to bring the demo up
+  start_demo
+fi
--- a/demos/use_cases/spotify_bearer_auth/arch_config.yaml
+++ b/demos/use_cases/spotify_bearer_auth/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 overrides:
  optimize_context_window: true
--- a/docs/source/build_with_arch/rag.rst
+++ b/docs/source/build_with_arch/rag.rst
@ -46,7 +46,7 @@ Multi-Turn RAG (Follow-up Questions)
 Developers often `struggle <https://www.reddit.com/r/LocalLLaMA/comments/18mqwg6/best_practice_for_rag_with_followup_chat/>`_ to efficiently handle
 ``follow-up`` or ``clarification`` questions. Specifically, when users ask for changes or additions to previous responses, it requires developers to
 re-write prompts using LLMs with precise prompt engineering techniques. This process is slow, manual, error-prone and adds significant latency to the
-user experience. Arch
+user experience.

 Arch is highly capable of accurately detecting and processing prompts in multi-turn scenarios so that you can build fast and accurate RAG apps in
 minutes. For additional details on how to build multi-turn RAG applications please refer to our :ref:`multi-turn <arch_multi_turn_guide>` docs.
--- a/docs/source/concepts/includes/arch_config.yaml
+++ b/docs/source/concepts/includes/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0 # or 127.0.0.1
-  port: 10000
-  # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
@ -51,11 +52,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
  app_server:
--- a/docs/source/concepts/llm_provider.rst
+++ b/docs/source/concepts/llm_provider.rst
@ -4,7 +4,7 @@ LLM Provider
 ============

 **LLM provider** is a top-level primitive in Arch, helping developers centrally define, secure, observe,
-and manage the usage of of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
+and manage the usage of their LLMs. Arch builds on Envoy's reliable `cluster subsystem <https://www.envoyproxy.io/docs/envoy/v1.31.2/intro/arch_overview/upstream/cluster_manager>`_
 to manage egress traffic to LLMs, which includes intelligent routing, retry and fail-over mechanisms,
 ensuring high availability and fault tolerance. This abstraction also enables developers to seamlessly
 switch between LLM providers or upgrade LLM versions, simplifying the integration and scaling of LLMs
--- a/docs/source/concepts/prompt_target.rst
+++ b/docs/source/concepts/prompt_target.rst
@ -3,14 +3,13 @@
 Prompt Target
 ==============

-**Prompt Targets** are a fundamental component of Arch, enabling developers to define how different types of user prompts are processed and routed within their generative AI applications.
-This section provides an in-depth look at prompt targets, including their purpose, configuration, usage, and best practices to help you effectively leverage this feature in your projects.
+**Prompt Targets** are a core concept in Arch, empowering developers to clearly define how user prompts are interpreted, processed, and routed within their generative AI applications. Prompts can seamlessly be routed either to specialized AI agents capable of handling sophisticated, context-driven tasks or to targeted tools provided by your application, offering users a fast, precise, and personalized experience.
+
+This section covers the essentials of prompt targets—what they are, how to configure them, their practical uses, and recommended best practices—to help you fully utilize this feature in your applications.

 What Are Prompt Targets?
 ------------------------
-Prompt targets are predefined endpoints within Arch that handle specific types of user prompts.
-They act as the bridge between user inputs and your backend services or APIs, enabling Arch to route, process, and manage prompts efficiently.
-By defining prompt targets, you can separate your application's business logic from the complexities of prompt processing, ensuring a cleaner and more maintainable codebase.
+Prompt targets are endpoints within Arch that handle specific types of user prompts. They act as the bridge between user inputs and your backend agents or tools (APIs), enabling Arch to route, process, and manage prompts efficiently. Defining prompt targets helps you decouple your application's core logic from processing and handling complexities, leading to clearer code organization, better scalability, and easier maintenance.


 .. table::
@ -21,7 +20,7 @@ By defining prompt targets, you can separate your application's business logic f
    ====================    ============================================
    Intent Recognition      Identify the purpose of a user prompt.
    Parameter Extraction    Extract necessary data from the prompt.
-    API Invocation          Call relevant backend services or functions.
+    Invocation              Call relevant backend agents or tools (APIs).
    Response Handling       Process and return responses to the user.
    ====================    ============================================

@ -30,16 +29,15 @@ Key Features

 Below are the key features of prompt targets that empower developers to build efficient, scalable, and personalized GenAI solutions:

- **Modular Design**: Define multiple prompt targets to handle diverse functionalities.
- **Parameter Management**: Specify required and optional parameters for each target.
- **Function Integration**: Seamlessly connect prompts to backend APIs or functions.
+- **Design Scenarios**: Define prompt targets to effectively handle specific agentic scenarios.
+- **Input Management**: Specify required and optional parameters for each target.
+- **Tools Integration**: Seamlessly connect prompts to backend APIs or functions.
 - **Error Handling**: Direct errors to designated handlers for streamlined troubleshooting.
 - **Metadata Enrichment**: Attach additional context to prompts for enhanced processing.

 Configuring Prompt Targets
 --------------------------
-Configuring prompt targets involves defining them in Arch's configuration file.
-Each Prompt target specifies how a particular type of prompt should be handled, including the endpoint to invoke and any parameters required.
+Configuring prompt targets involves defining them in Arch's configuration file. Each prompt target specifies how a particular type of prompt should be handled, including the endpoint to invoke and any parameters required.

 Basic Configuration
 ~~~~~~~~~~~~~~~~~~~
@ -50,37 +48,38 @@ A prompt target configuration includes the following elements:

 - ``name``: A unique identifier for the prompt target.
 - ``description``: A brief explanation of what the prompt target does.
- ``endpoint``: The API endpoint or function that handles the prompt.
+- ``endpoint``: Required if you want to call a tool or specific API. ``name``, ``path``, and ``http_method`` are the three attributes of the endpoint.
 - ``parameters`` (Optional): A list of parameters to extract from the prompt.

+.. _defining_prompt_target_parameters:
+
 Defining Parameters
 ~~~~~~~~~~~~~~~~~~~
 Parameters are the pieces of information that Arch needs to extract from the user's prompt to perform the desired action.
-Each parameter can be marked as required or optional.
-Here is a full list of parameter attributes that Arch can support:
+Each parameter can be marked as required or optional. Here is a full list of parameter attributes that Arch can support:

 .. table::
    :width: 100%

-    ====================      ============================================================================
+    ========================  ============================================================================
    **Attribute**             **Description**
-    ====================      ============================================================================
-    ``name``                  Specifies identifier of parameters
-    ``type``                  Specifies the data type of the parameter.
-    ``description``           Provides a human-readable explanation of the parameter's purpose.
-    ``required``              Indicates whether the parameter is mandatory or optional
+    ========================  ============================================================================
+    ``name (req.)``           Specifies name of the parameter.
+    ``description (req.)``    Provides a human-readable explanation of the parameter's purpose.
+    ``type (req.)``           Specifies the data type. Supported types include: **int**, **str**, **float**, **bool**, **list**, **set**, **dict**, **tuple**
+    ``in_path``               Indicates whether the parameter is part of the path in the endpoint url. Valid values: **true** or **false**
    ``default``               Specifies a default value for the parameter if not provided by the user.
-    ``items``                 Used in the context of arrays to define the schema of items within an array.
-    ``format``                Specifies a format for the parameter value, e.g., date and email
-    ``enum``                  Lists the allowable values for the parameter.
-    ``minimum``               Defines the minimum acceptable value for numeric parameters.
-    ``maximum``               Specifies the maximum acceptable value for numeric parameters.
-    ====================      ============================================================================
+    ``format``                Specifies a format for the parameter value. For example: ``2019-12-31`` for a date value.
+    ``enum``                  Lists the allowable values for the parameter, with data type matching the ``type`` attribute. **Usage Example**: ``enum: ["celsius", "fahrenheit"]``
+    ``items``                 Specifies the attribute of the elements when type equals **list**, **set**, **dict**, **tuple**. **Usage Example**: ``items: {"type": "str"}``
+    ``required``              Indicates whether the parameter is mandatory or optional. Valid values: **true** or **false**
+    ========================  ============================================================================

-Example Configuration
-~~~~~~~~~~~~~~~~~~~~~
+Example Configuration For Tools
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. code-block:: yaml
+    :caption: Tools and Function Calling Configuration Example

    prompt_targets:
      - name: get_weather
@ -99,16 +98,35 @@ Example Configuration
          name: api_server
          path: /weather

+Example Configuration For Agents
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: yaml
+    :caption: Agent Orchestration Configuration Example
+
+    overrides:
+      use_agent_orchestrator: true
+
+    prompt_targets:
+      - name: sales_agent
+        description: handles queries related to sales and purchases
+
+      - name: issues_and_repairs
+        description: handles issues, repairs, or refunds
+
+      - name: escalate_to_human
+        description: escalates to human agent
+
+.. note::
+    Today, you can use Arch to coordinate more specific agentic scenarios via tools and function calling, or use it for high-level agent routing and hand-off scenarios. In the future, we plan to offer you the ability to combine these two approaches for more complex scenarios. Please see `GitHub issue #442 <https://github.com/katanemo/archgw/issues/442>`_ for more details.

 Routing Logic
 -------------
-Prompt targets determine where and how user prompts are processed.
-Arch uses intelligent routing logic to ensure that prompts are directed to the appropriate targets based on their intent and context.
+Prompt targets determine where and how user prompts are processed. Arch uses intelligent routing logic to ensure that prompts are directed to the appropriate targets based on their intent and context.

 Default Targets
 ~~~~~~~~~~~~~~~
-For general-purpose prompts that do not match any specific prompt target, Arch routes them to a designated default target.
-This is useful for handling open-ended queries like document summarization or information extraction.
+For general-purpose prompts that do not match any specific prompt target, Arch routes them to a designated default target. This is useful for handling open-ended queries like document summarization or information extraction.

 Intent Matching
 ~~~~~~~~~~~~~~~
@ -125,5 +143,5 @@ For example:
 Summary
 --------
 Prompt targets are essential for defining how user prompts are handled within your generative AI applications using Arch.
-By carefully configuring prompt targets, you can ensure that prompts are accurately routed, necessary parameters are extracted, and backend services are invoked seamlessly.
-This modular approach not only simplifies your application's architecture but also enhances scalability, maintainability, and overall user experience.
+
+By carefully configuring prompt targets, you can ensure that prompts are accurately routed, necessary parameters are extracted, and backend services are invoked seamlessly. This modular approach not only simplifies your application's architecture but also enhances scalability, maintainability, and overall user experience.
--- a/docs/source/concepts/tech_overview/listener.rst
+++ b/docs/source/concepts/tech_overview/listener.rst
@ -5,7 +5,7 @@ Listener
 **Listener** is a top level primitive in Arch, which simplifies the configuration required to bind incoming
 connections from downstream clients, and for egress connections to LLMs (hosted or API)

-Arch builds on Envoy's Listener subsystem to streamline connection managemet for developers. Arch minimizes
+Arch builds on Envoy's Listener subsystem to streamline connection management for developers. Arch minimizes
 the complexity of Envoy's listener setup by using best-practices and exposing only essential settings,
 making it easier for developers to bind connections without deep knowledge of Envoy’s configuration model. This
 simplification ensures that connections are secure, reliable, and optimized for performance.
@ -13,7 +13,7 @@ simplification ensures that connections are secure, reliable, and optimized for
 Downstream (Ingress)
 ^^^^^^^^^^^^^^^^^^^^^^
 Developers can configure Arch to accept connections from downstream clients. A downstream listener acts as the
-primary entry point for incoming traffic, handling initial connection setup, including network filtering, gurdrails,
+primary entry point for incoming traffic, handling initial connection setup, including network filtering, guardrails,
 and additional network security checks. For more details on prompt security and safety,
 see :ref:`here <arch_overview_prompt_handling>`.

@ -27,7 +27,7 @@ address like ``arch.local:12000/v1`` for outgoing traffic. For more details on L
 Configure Listener
 ^^^^^^^^^^^^^^^^^^

-To configure a Downstream (Ingress) Listner, simply add the ``listener`` directive to your configuration file:
+To configure a Downstream (Ingress) Listener, simply add the ``listener`` directive to your configuration file:

 .. literalinclude:: ../includes/arch_config.yaml
    :language: yaml
--- a/docs/source/concepts/tech_overview/model_serving.rst
+++ b/docs/source/concepts/tech_overview/model_serving.rst
@ -5,7 +5,7 @@ Model Serving

 Arch is a set of `two` self-contained processes that are designed to run alongside your application
 servers (or on a separate host connected via a network). The first process is designated to manage low-level
-networking and HTTP related comcerns, and the other process is for model serving, which helps Arch make
+networking and HTTP related concerns, and the other process is for model serving, which helps Arch make
 intelligent decisions about the incoming prompts. The model server is designed to call the purpose-built
 LLMs in Arch.

@ -16,7 +16,7 @@ LLMs in Arch.

 Arch is designed to be deployed in your cloud VPC, on an on-premises host, and can work on devices that don't
 have a GPU. Note, GPU devices are needed for fast and cost-efficient use, so that Arch (model server, specifically)
-can process prompts quickly and forward control back to the applicaton host. There are three modes in which Arch
+can process prompts quickly and forward control back to the application host. There are three modes in which Arch
 can be configured to run its **model server** subsystem:

 Local Serving (CPU - Moderate)
@ -32,7 +32,7 @@ might not be available.
 Cloud Serving (GPU - Blazing Fast)
 ----------------------------------
 The command below instructs Arch to intelligently use GPUs locally for fast intent detection, but default to
-cloud serving for function calling and guardails scenarios to dramatically improve the speed and overall performance
+cloud serving for function calling and guardrails scenarios to dramatically improve the speed and overall performance
 of your applications.

 .. code-block:: console
@ -40,6 +40,6 @@ of your applications.
    $ archgw up

 .. Note::
-    Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with averlage latency
+    Arch's model serving in the cloud is priced at $0.05M/token (156x cheaper than GPT-4o) with average latency
    of 200ms (10x faster than GPT-4o). Please refer to our :ref:`Get Started <quickstart>` to know
    how to generate API keys for model serving
--- a/docs/source/concepts/tech_overview/prompt.rst
+++ b/docs/source/concepts/tech_overview/prompt.rst
@ -8,7 +8,7 @@ Arch relies on Envoy's HTTP `connection management <https://www.envoyproxy.io/do
 subsystem and its **prompt handler** subsystem engineered with purpose-built LLMs to
 implement critical functionality on behalf of developers so that you can stay focused on business logic.

-Arch's **prompt handler** subsystem interacts with the **model subsytem** through Envoy's cluster manager system to ensure robust, resilient and fault-tolerant experience in managing incoming prompts.
+Arch's **prompt handler** subsystem interacts with the **model subsystem** through Envoy's cluster manager system to ensure robust, resilient and fault-tolerant experience in managing incoming prompts.

 .. seealso::
   Read more about the :ref:`model subsystem <model_serving>` and how the LLMs are hosted in Arch.
@ -28,7 +28,7 @@ Prompt Guard
 -----------------

 Arch is engineered with `Arch-Guard <https://huggingface.co/collections/katanemo/arch-guard-6702bdc08b889e4bce8f446d>`_, an industry leading safety layer, powered by a
-compact and high-performimg LLM that monitors incoming prompts to detect and reject jailbreak attempts -
+compact and high-performing LLM that monitors incoming prompts to detect and reject jailbreak attempts -
 ensuring that unauthorized or harmful behaviors are intercepted early in the process.

 To add jailbreak guardrails, see example below:
@ -50,7 +50,7 @@ Prompt Targets
 --------------

 Once a prompt passes any configured guardrail checks, Arch processes the contents of the incoming conversation
-and identifies where to forwad the conversation to via its ``prompt target`` primitve. Prompt targets are endpoints
+and identifies where to forward the conversation to via its ``prompt target`` primitive. Prompt targets are endpoints
 that receive prompts that are processed by Arch. For example, Arch enriches incoming prompts with metadata like knowing
 when a user's intent has changed so that you can build faster, more accurate RAG apps.

@ -72,7 +72,7 @@ Intent Matching

 Arch uses fast text embedding and intent recognition approaches to first detect the intent of each incoming prompt.
 This intent matching phase analyzes the prompt's content and matches it against predefined prompt targets, ensuring that each prompt is forwarded to the most appropriate endpoint.
-Arch’s intent matching framework considers both the name and description of each prompt target, and uses a composite matching score between embedding similarity and intent classification scores to enchance accuracy in forwarding decisions.
+Arch’s intent matching framework considers both the name and description of each prompt target, and uses a composite matching score between embedding similarity and intent classification scores to enhance accuracy in forwarding decisions.

 - **Intent Recognition**: NLI techniques further refine the matching process by evaluating the semantic alignment between the prompt and potential targets.

--- a/docs/source/concepts/tech_overview/request_lifecycle.rst
+++ b/docs/source/concepts/tech_overview/request_lifecycle.rst
@ -5,7 +5,7 @@ Request Lifecycle

 Below we describe the events in the lifecycle of a request passing through an Arch gateway instance. We first
 describe how Arch fits into the request path and then the internal events that take place following
-the arrival of a request at Arch from downtream clients. We follow the request until the corresponding
+the arrival of a request at Arch from downstream clients. We follow the request until the corresponding
 dispatch upstream and the response path.

 .. image:: /_static/img/network-topology-ingress-egress.jpg
@ -59,7 +59,7 @@ The request processing path in Arch has three main parts:
  lifecycle. The downstream and upstream HTTP/2 codec lives here.
 * :ref:`Prompt handler subsystem <arch_overview_prompt_handling>` which is responsible for selecting and
  forwarding prompts to ``prompt_targets`` and establishes the lifecycle of any **upstream** connection to a
-  hosted endpoint that implements domain-specific business logic for incoming promots. This is where knowledge
+  hosted endpoint that implements domain-specific business logic for incoming prompts. This is where knowledge
  of targets and endpoint health, load balancing and connection pooling exists.
 * :ref:`Model serving subsystem <model_serving>` which helps Arch make intelligent decisions about the
  incoming prompts. The model server is designed to call the purpose-built LLMs in Arch.
@ -67,7 +67,7 @@ The request processing path in Arch has three main parts:
 The three subsystems are bridged with the HTTP router filter and the cluster manager subsystems of Envoy.

 Also, Arch utilizes `Envoy event-based thread model <https://blog.envoyproxy.io/envoy-threading-model-a8d44b922310>`_.
-A main thread is responsible forthe server lifecycle, configuration processing, stats, etc. and some number of
+A main thread is responsible for the server lifecycle, configuration processing, stats, etc. and some number of
 :ref:`worker threads <arch_overview_threading>` process requests. All threads operate around an event loop (`libevent <https://libevent.org/>`_)
 and any given downstream TCP connection will be handled by exactly one worker thread for its lifetime. Each worker
 thread maintains its own pool of TCP connections to upstream endpoints.
@ -99,7 +99,7 @@ A brief outline of the lifecycle of a request and response using the example con
   that harmful or unwanted behaviors are detected early in the request processing pipeline.

 3. **Intent Matching**:
-   The decrypted data stream is deframed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
+   The decrypted data stream is de-framed by the HTTP/2 codec in Arch's HTTP connection manager. Arch performs
   intent matching via its **prompt-handler** subsystem using the name and description of the defined prompt targets,
   determining which endpoint should handle the prompt.

@ -162,7 +162,7 @@ Post-request processing
 Once a request completes, the stream is destroyed. The following also take place:

 * The post-request :ref:`monitoring <monitoring>` are updated (e.g. timing, active requests, upgrades, health checks).
-  Some statistics are updated earlier however, during request processing. Stats are batchedand written by the main
+  Some statistics are updated earlier however, during request processing. Stats are batched and written by the main
  thread periodically.
 * :ref:`Access logs <arch_access_logging>` are written to the access log
 * :ref:`Trace <arch_overview_tracing>` spans are finalized. If our example request was traced, a
--- a/docs/source/concepts/tech_overview/terminology.rst
+++ b/docs/source/concepts/tech_overview/terminology.rst
@ -7,12 +7,12 @@ A few definitions before we dive into the main architecture documentation. Also
 to keep things consistent in logs and traces, and introduces and clarifies concepts as they relate to LLM applications.

 **Agent**: An application that uses LLMs to handle wide-ranging tasks from users via prompts. This could be as simple
-as retrieving or summarizing data from an API, or being able to trigger compleix actions like adjusting ad campaigns, or
+as retrieving or summarizing data from an API, or being able to trigger complex actions like adjusting ad campaigns, or
 changing travel plans via prompts.

 **Arch Config**: Arch operates based on a configuration that controls the behavior of a single instance of the Arch gateway.
 This is where you enable capabilities like LLM routing, fast function calling (via prompt_targets), applying guardrails, and enabling critical
-features like metrics and tracing. For the full configuration reference of `arch_config.yaml` see :ref:`here <configuration_refernce>`.
+features like metrics and tracing. For the full configuration reference of `arch_config.yaml` see :ref:`here <configuration_reference>`.

 **Downstream(Ingress)**: A downstream client (web application, etc.) connects to Arch, sends prompts, and receives responses.

@ -37,11 +37,11 @@ code to LLMs.
 undifferentiated work in building generative AI apps. Prompt targets are endpoints that receive prompts that are processed by Arch.
 For example, Arch enriches incoming prompts with metadata like knowing when a request is a follow-up or clarifying prompt so that you
 can build faster, more accurate retrieval (RAG) apps. To support agentic apps, like scheduling travel plans or sharing comments on a
-document - via prompts, Arch uses its function calling abilities to extract critical information fromthe incoming prompt (or a set of
+document - via prompts, Arch uses its function calling abilities to extract critical information from the incoming prompt (or a set of
 prompts) needed by a downstream backend API or function call before calling it directly.

 **Model Serving**: Arch is a set of `two` self-contained processes that are designed to run alongside your application servers
-(or on a separate hostconnected via a network).The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions
+(or on a separate host connected via a network). The :ref:`model serving <model_serving>` process helps Arch make intelligent decisions
 about the incoming prompts. The model server is designed to call the (fast) purpose-built LLMs in Arch.

 **Error Target**: :ref:`Error targets <error_target>` are those endpoints that receive forwarded errors from Arch when issues arise,
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -15,7 +15,7 @@ from sphinxawesome_theme.postprocess import Icons
 project = "Arch Docs"
 copyright = "2025, Katanemo Labs, Inc"
 author = "Katanemo Labs, Inc"
-release = " v0.2.1"
+release = " v0.2.6"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
--- a/docs/source/get_started/intro_to_arch.rst
+++ b/docs/source/get_started/intro_to_arch.rst
@ -3,10 +3,17 @@
 Intro to Arch
 =============

-Arch is an intelligent `(Layer 7) <https://www.cloudflare.com/learning/ddos/what-is-layer-7/>`_ gateway designed for generative AI apps, agents, copilots that work with prompts.
-Engineered with purpose-built large language models (LLMs), Arch handles all the critical but undifferentiated tasks related to the handling and processing of prompts, including
-detecting and rejecting jailbreak attempts, intelligently calling “backend” APIs to fulfill the user's request represented in a prompt, routing to and offering disaster recovery
-between upstream LLMs, and managing the observability of prompts and LLM interactions in a centralized way.
+Arch is an intelligent proxy server designed for agentic applications. **Move faster** by letting Arch handle the **pesky heavy lifting** in building agents:
+fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs.
+
+Past the thrill of an AI demo, have you found yourself hitting these walls? You know, the all too familiar ones:
+
+- You break a prompt into specialized ones, but **get stuck writing routing** and handoff logic?
+- You want to use new LLMs, but **struggle to quickly add LLMs** without writing integration logic?
+- You're **trapped in tedious prompting work** to clarify inputs and user intents?
+- You're **wasting cycles** choosing and integrating **code for observability** instead of it just happening transparently?
+
+And you think to yourself, can't I move faster by focusing on higher-level objectives in a language and framework agnostic way? Well, you can!

 .. figure:: /_static/img/arch_network_diagram_high_level.png
   :width: 100%
@ -15,7 +22,7 @@ between upstream LLMs, and managing the observability of prompts and LLM interac
   High-level network flow of where Arch Gateway sits in your agentic stack. Designed for both ingress and egress prompt traffic.


-**The project was born out of the belief that:**
+**Arch Gateway was built by the contributors of Envoy Proxy with the belief that:**

  *Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests
  including secure handling, intelligent routing, robust observability, and integration with backend (API)
@ -28,7 +35,7 @@ Arch takes a dependency on Envoy and is a self-contained process that is designe
 Arch uses Envoy's HTTP connection management subsystem, HTTP L7 filtering and telemetry capabilities to extend the functionality exclusively for prompts and LLMs.
 This gives Arch several advantages:

-* Arch builds on Envoy's proven success. Envoy is used at masssive scale by the leading technology companies of our time including `AirBnB <https://www.airbnb.com>`_, `Dropbox <https://www.dropbox.com>`_, `Google <https://www.google.com>`_, `Reddit <https://www.reddit.com>`_, `Stripe <https://www.stripe.com>`_, etc. Its battle tested and scales linearly with usage and enables developers to focus on what really matters: application features and business logic.
+* Arch builds on Envoy's proven success. Envoy is used at massive scale by the leading technology companies of our time including `AirBnB <https://www.airbnb.com>`_, `Dropbox <https://www.dropbox.com>`_, `Google <https://www.google.com>`_, `Reddit <https://www.reddit.com>`_, `Stripe <https://www.stripe.com>`_, etc. It's battle-tested and scales linearly with usage and enables developers to focus on what really matters: application features and business logic.

 * Arch works with any application language. A single Arch deployment can act as gateway for AI applications written in Python, Java, C++, Go, Php, etc.

@ -47,7 +54,7 @@ These LLMs are designed to be best-in-class for critical prompt-related tasks li
  With prompt guardrails you can prevent ``jailbreak attempts`` present in user's prompts without having to write a single line of code.
  To learn more about how to configure guardrails available in Arch, read :ref:`Prompt Guard <prompt_guard>`.

-**Traffic Management:** Arch offers several capabilities for LLM calls originating from your applications, including smart retries on errors from upstream LLMs, and automatic cutover to other LLMs configured in Arch for continuous availability and disaster recovery scenarios.
+**Traffic Management:** Arch offers several capabilities for LLM calls originating from your applications, including smart retries on errors from upstream LLMs, and automatic cut-over to other LLMs configured in Arch for continuous availability and disaster recovery scenarios.
 Arch extends Envoy's `cluster subsystem <https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/cluster_manager>`_ to manage upstream connections to LLMs so that you can build resilient AI applications.

 **Front/edge Gateway:** There is substantial benefit in using the same software at the edge (observability, traffic shaping algorithms, applying guardrails, etc.) as for outbound LLM inference use cases.
--- a/docs/source/get_started/overview.rst
+++ b/docs/source/get_started/overview.rst
@ -3,7 +3,11 @@

 Overview
 ============
-Welcome to Arch, the intelligent prompt gateway designed to help developers build **fast**, **secure**, and **personalized** generative AI apps at ANY scale.
+Welcome to Arch, the intelligent (edge and LLM) proxy server for agentic applications.
+
+Move **faster** by letting Arch handle the pesky heavy lifting in building agents: **fast input clarification**, **agent routing**,
+seamless integration of prompts with **tools for common tasks**, and **unified access and observability of LLMs**.
+
 In this documentation, you will learn how to quickly set up Arch to trigger API calls via prompts, apply prompt guardrails without writing any application-level logic,
 simplify the interaction with upstream LLMs, and improve observability all while simplifying your application development process.

--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -25,7 +25,7 @@ Arch's CLI allows you to manage and interact with the Arch gateway efficiently.

   $ python -m venv venv
   $ source venv/bin/activate   # On Windows, use: venv\Scripts\activate
-   $ pip install archgw==0.2.1
+   $ pip install archgw==0.2.6


 Build AI Agent with Arch Gateway
@ -42,11 +42,12 @@ Create ``arch_config.yaml`` file with the following content:

   version: v0.1

-   listener:
-     address: 0.0.0.0
-     port: 10000
-     message_format: huggingface
-     connect_timeout: 0.005s
+   listeners:
+     ingress_traffic:
+       address: 0.0.0.0
+       port: 10000
+       message_format: openai
+       timeout: 30s

   llm_providers:
     - name: gpt-4o
@ -144,22 +145,23 @@ Create ``arch_config.yaml`` file with the following content:

   version: v0.1

-   listener:
-     address: 0.0.0.0
-     port: 10000
-     message_format: huggingface
-     connect_timeout: 0.005s
+   listeners:
+     egress_traffic:
+       address: 0.0.0.0
+       port: 12000
+       message_format: openai
+       timeout: 30s

   llm_providers:
     - name: gpt-4o
       access_key: $OPENAI_API_KEY
-       provider: openai
+       provider_interface: openai
       model: gpt-4o
       default: true

     - name: ministral-3b
       access_key: $MISTRAL_API_KEY
-       provider: mistral
+       provider_interface: openai
       model: ministral-3b-latest

 Step 2. Start arch gateway
--- a/docs/source/guides/agent_routing.rst
+++ b/docs/source/guides/agent_routing.rst
@ -0,0 +1,105 @@
+.. _agent_routing:
+
+Agent Routing and Hand Off
+===========================
+
+Agent Routing and Hand Off is a key feature in Arch that enables intelligent routing of user prompts to specialized AI agents or human agents based on the nature and complexity of the user's request.
+
+This capability significantly enhances the efficiency and personalization of interactions, ensuring each prompt receives the most appropriate and effective handling. The following section describes
+the workflow, configuration, and implementation of Agent routing and hand off in Arch.
+
+#. **Agent Selection**
+   When a user submits a prompt, Arch analyzes the input to determine the intent and complexity. Based on the analysis, Arch selects the most suitable agent configured within your application to handle the specific category of the user's request—such as sales inquiries, technical issues, or complex scenarios requiring human attention.
+
+#. **Prompt Routing**
+   After selecting the appropriate agent, Arch routes the user's prompt to the designated agent's endpoint and waits for the agent to respond back with the processed output or further instructions.
+
+#. **Hand Off**
+   Based on follow-up queries from the user, Arch repeats the process of analysis, agent selection, and routing to ensure a seamless hand off between AI agents as needed.
+
+.. code-block:: yaml
+    :caption: Agent Routing and Hand Off Configuration Example
+
+    prompt_targets:
+      - name: sales_agent
+        description: Handles queries related to sales and purchases
+
+      - name: issues_and_repairs
+        description: handles issues, repairs, or refunds
+
+      - name: escalate_to_human
+        description: escalates to human agent
+
+.. code-block:: python
+    :caption: Agent Routing and Hand Off Implementation Example via FastAPI
+
+    class Agent:
+        def __init__(self, role: str, instructions: str):
+            self.system_prompt = f"You are a {role}.\n{instructions}"
+
+        def handle(self, req: ChatCompletionsRequest):
+            messages = [{"role": "system", "content": self.get_system_prompt()}] + [
+                message.model_dump() for message in req.messages
+            ]
+            return call_openai(messages, req.stream) #call_openai is a placeholder for the actual API call
+
+        def get_system_prompt(self) -> str:
+            return self.system_prompt
+
+    # Define your agents
+    AGENTS = {
+        "sales_agent": Agent(
+            role="sales agent",
+            instructions=(
+                "Always answer in a sentence or less.\n"
+                "Follow the following routine with the user:\n"
+                "1. Engage\n"
+                "2. Quote ridiculous price\n"
+                "3. Reveal caveat if user agrees."
+            ),
+        ),
+        "issues_and_repairs": Agent(
+            role="issues and repairs agent",
+            instructions="Propose a solution, offer refund if necessary.",
+        ),
+        "escalate_to_human": Agent(
+            role="human escalation agent", instructions="Escalate issues to a human."
+        ),
+        "unknown_agent": Agent(
+            role="general assistant", instructions="Assist the user in general queries."
+        ),
+    }
+
+    #handle the request from arch gateway
+    @app.post("/v1/chat/completions")
+    def completion_api(req: ChatCompletionsRequest, request: Request):
+
+        agent_name = req.metadata.get("agent-name", "unknown_agent")
+        agent = AGENTS.get(agent_name)
+        logger.info(f"Routing to agent: {agent_name}")
+
+        return agent.handle(req)
+
+.. note::
+    The above example demonstrates a simple implementation of Agent Routing and Hand Off using FastAPI. For the full implementation of this example
+    please see our `GitHub demo <https://github.com/katanemo/archgw/tree/main/demos/use_cases/orchestrating_agents>`_.
+
+Example Use Cases
+-----------------
+Agent Routing and Hand Off is particularly beneficial in scenarios such as:
+
+- **Customer Support**: Routing common customer queries to automated support agents, while escalating complex or sensitive issues to human support staff.
+- **Sales and Marketing**: Automatically directing potential leads and sales inquiries to specialized sales agents for timely and targeted follow-ups.
+- **Technical Assistance**: Managing user-reported issues, repairs, or refunds by assigning them to the correct technical or support agent efficiently.
+
+Best Practices and Tips
+------------------------
+When implementing Agent Routing and Hand Off in your applications, consider these best practices:
+
+- Clearly define agent responsibilities: Ensure each agent or human endpoint has a clear, specific description of the prompts they handle, reducing mis-routing.
+- Monitor and optimize routes: Regularly review how prompts are routed to adjust and optimize agent definitions and configurations.
+
+.. note::
+    To observe traffic to and from agents, please read more about :ref:`observability <observability>` in Arch.
+
+By carefully configuring and managing your Agent routing and hand off, you can significantly improve your application's responsiveness, performance, and overall user satisfaction.
--- a/docs/source/guides/function_calling.rst
+++ b/docs/source/guides/function_calling.rst
@ -118,6 +118,9 @@ Specify the parameters your function needs and how Arch should interpret these.
          name: api_server
          path: /weather

+.. Note::
+    For a complete reference of attributes that you can configure in a prompt target, see :ref:`here <defining_prompt_target_parameters>`.
+
 Step 3: Arch Takes Over
 ~~~~~~~~~~~~~~~~~~~~~~~
 Once you have defined the functions and configured the prompt targets, Arch Gateway takes care of the remaining work.
--- a/docs/source/guides/includes/arch_config.yaml
+++ b/docs/source/guides/includes/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0 # or 127.0.0.1
-  port: 10000
-  # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
@ -53,11 +54,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
  app_server:
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -12,11 +12,11 @@ Welcome to Arch!
   <p>Build <strong>fast</strong>, <strong>observable</strong>, and <strong>personalized</strong> GenAI apps</p>
   </div>

-   <a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
+   <a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=dark&period=daily&t=1742433071161" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>

-`Arch <https://github.com/katanemo/arch>`_ is an intelligent gateway for agents - an infrastructure primitive for GenAI (built by the contributors of `Envoy <https://www.envoyproxy.io/>`_ ). The project was born out of the belief that:
+`Arch <https://github.com/katanemo/arch>`_ is an intelligent (edge and LLM) proxy, exclusively designed for prompts and agents - and built by the contributors of the widely adopted and loved `Envoy <https://www.envoyproxy.io/>`_.

-  *Prompts are nuanced and opaque user requests, which require the same capabilities as traditional HTTP requests including secure handling, intelligent routing, robust observability, and integration with backend (API) systems for personalization - all outside business logic.*
+**Move faster** by letting Arch handle the **pesky heavy lifting** in building agents: fast input clarification, agent routing, seamless integration of prompts with tools for common tasks, and unified access and observability of LLMs - all outside business logic.

 .. tab-set::

@ -50,6 +50,7 @@ Welcome to Arch!
      :maxdepth: 2

      guides/prompt_guard
+      guides/agent_routing
      guides/function_calling
      guides/observability/observability

--- a/docs/source/resources/configuration_reference.rst
+++ b/docs/source/resources/configuration_reference.rst
@ -1,9 +1,9 @@
-.. _configuration_refernce:
+.. _configuration_reference:

 Configuration Reference
 =======================

-The following is a complete reference of the ``arch_conifg.yml`` that controls the behavior of a single instance of
+The following is a complete reference of the ``arch_config.yml`` that controls the behavior of a single instance of
 the Arch gateway. This is where you enable capabilities like routing to upstream LLM providers, defining prompt_targets
 where prompts get routed to, apply guardrails, and enable critical agent observability features.

--- a/docs/source/resources/includes/arch_config_full_reference.yaml
+++ b/docs/source/resources/includes/arch_config_full_reference.yaml
@ -1,14 +1,16 @@
 version: v0.1

 listeners:
-  prompt_gateway:
+  ingress_traffic:
    address: 0.0.0.0
    port: 10000
    message_format: openai
    timeout: 5s
-  llm_gateway:
+  egress_traffic:
    address: 0.0.0.0
    port: 12000
+    message_format: openai
+    timeout: 5s

 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
@ -33,14 +35,6 @@ llm_providers:
    access_key: $OPENAI_API_KEY
    model: gpt-4o
    default: true
-    rate_limits:
-      selector: #optional headers, to add rate limiting based on http headers like JWT tokens or API keys
-        http_header:
-          name: Authorization
-          value: "" # Empty value means each separate value has a separate limit
-      limit:
-        tokens: 100000 # Tokens per unit
-        unit: minute

  - name: Mistral8x7b
    provider_interface: openai
@ -54,8 +48,8 @@ llm_providers:

 # provides a way to override default settings for the arch system
 overrides:
-  # By default Arch uses an NLI + embedding approach to match an incomming prompt to a prompt target.
-  # The intent matching threshold is kept at 0.80, you can overide this behavior if you would like
+  # By default Arch uses an NLI + embedding approach to match an incoming prompt to a prompt target.
+  # The intent matching threshold is kept at 0.80, you can override this behavior if you would like
  prompt_target_intent_matching_threshold: 0.60

 # default system prompt used by all prompt targets
@ -96,11 +90,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 tracing:
  # sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
  sampling_rate: 0.1
--- a/Show more
+++ b/Show more