Update arch_config and add tests for arch config file (#407)

2026-07-20 16:41:04 +02:00 · 2025-02-14 19:28:10 -08:00 · 2025-02-14 19:28:10 -08:00 · e40b13be05
commit e40b13be05
parent d0a783cca8
31 changed files with 379 additions and 212 deletions
--- a/.github/workflows/validate_arch_config.yml
+++ b/.github/workflows/validate_arch_config.yml
@ -0,0 +1,34 @
+# Builds the arch gateway image and runs arch/validate_arch_config.sh, which
+# checks the arch_config files found in the repository against that image.
+name: arch config tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  validate_arch_config:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: .
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # The validation script runs katanemo/archgw:latest, so tag the build to match.
+      - name: build arch docker image
+        run: |
+          docker build -f arch/Dockerfile . -t katanemo/archgw:latest
+
+      - name: validate arch config
+        run: |
+          bash arch/validate_arch_config.sh
--- a/arch/arch_config_schema.yaml
+++ b/arch/arch_config_schema.yaml
@ -3,21 +3,38 @@ type: object
 properties:
  version:
    type: string
-  listener:
+  listeners:
    type: object
-    properties:
-      address:
-        type: string
-      port:
-        type: integer
-      message_format:
-        type: string
-      connect_timeout:
-        type: string
    additionalProperties: false
-    required:
-      - address
-      - port
+    properties:
+      ingress_traffic:
+        type: object
+        properties:
+          address:
+            type: string
+          port:
+            type: integer
+          message_format:
+            type: string
+            enum:
+              - openai
+          timeout:
+            type: string
+        additionalProperties: false
+      egress_traffic:
+        type: object
+        properties:
+          address:
+            type: string
+          port:
+            type: integer
+          message_format:
+            type: string
+            enum:
+              - openai
+          timeout:
+            type: string
+        additionalProperties: false
  endpoints:
    type: object
    patternProperties:
@ -107,7 +124,10 @@ properties:
              required:
                type: boolean
              default:
-                type: string
+                anyOf:
+                  - type: string
+                  - type: integer
+                  - type: boolean
              description:
                type: string
              type:
@ -115,7 +135,10 @@ properties:
              enum:
                type: array
                items:
-                  type: string
+                  anyOf:
+                    - type: string
+                    - type: integer
+                    - type: boolean
              in_path:
                type: boolean
              format:
@ -224,5 +247,4 @@ properties:
 additionalProperties: false
 required:
  - version
-  - listener
  - llm_providers
--- a/arch/envoy.template.yaml
+++ b/arch/envoy.template.yaml
@ -29,11 +29,11 @@ stats_config:
      - 180000
 static_resources:
  listeners:
-    - name: arch_listener_http
+    - name: ingress_traffic
      address:
        socket_address:
-          address: 0.0.0.0
-          port_value: 10000
+          address: {{ prompt_gateway_listener.address }}
+          port_value: {{ prompt_gateway_listener.port }}
      traffic_direction: INBOUND
      filter_chains:
        - filters:
@ -55,7 +55,7 @@ static_resources:
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: ingress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -76,13 +76,13 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: arch_prompt_gateway_listener
-                            timeout: 60s
+                            timeout: {{ prompt_gateway_listener.timeout }}
                http_filters:
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_prompt_gateway_listener
+    - name: ingress_traffic_prompt
      address:
        socket_address:
          address: 0.0.0.0
@ -104,11 +104,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: prompt_processor
+                      service_name: ingress_traffic
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_prompt_gateway_listener
+                stat_prefix: ingress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -201,7 +201,7 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-    - name: arch_internal
+    - name: egress_api_traffic
      address:
        socket_address:
          address: 0.0.0.0
@ -223,11 +223,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: prompt_processor
+                      service_name: egress_api_traffic
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_internal
+                stat_prefix: egress_api_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -273,13 +273,12 @@ static_resources:
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-
-    - name: arch_listener_http_llm
+    - name: egress_traffic
      address:
        socket_address:
-          address: 0.0.0.0
-          port_value: 12000
-      traffic_direction: INBOUND
+          address: {{ llm_gateway_listener.address }}
+          port_value: {{ llm_gateway_listener.port }}
+      traffic_direction: OUTBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
@ -300,7 +299,7 @@ static_resources:
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: egress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -321,14 +320,13 @@ static_resources:
                          route:
                            auto_host_rewrite: true
                            cluster: arch_listener_llm
-                            timeout: 60s
+                            timeout: {{ llm_gateway_listener.timeout }}
                http_filters:
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

-
-    - name: arch_listener_llm
+    - name: egress_traffic_llm
      address:
        socket_address:
          address: 0.0.0.0
@ -349,11 +347,11 @@ static_resources:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
-                      service_name: llm_gateway
+                      service_name: egress_traffic_llm
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
-                stat_prefix: arch_listener_http
+                stat_prefix: egress_traffic
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
@ -443,7 +441,7 @@ static_resources:

  clusters:
    - name: openai
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -467,7 +465,7 @@ static_resources:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
    - name: mistral
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -488,7 +486,7 @@ static_resources:
          sni: api.mistral.ai
    {% for internal_cluster in ["arch_fc", "model_server"] %}
    - name: {{ internal_cluster }}
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -504,7 +502,7 @@ static_resources:
                  hostname: {{ internal_cluster }}
    {% endfor %}
    - name: mistral_7b_instruct
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -523,7 +521,7 @@ static_resources:
      {% if cluster.connect_timeout -%}
      connect_timeout: {{ cluster.connect_timeout }}
      {% else -%}
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      {% endif -%}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
@ -557,7 +555,7 @@ static_resources:

 {% for local_llm_provider in local_llms %}
    - name: {{ local_llm_provider.name }}
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -589,7 +587,7 @@ static_resources:

 {% endfor %}
    - name: arch_internal
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -605,7 +603,7 @@ static_resources:
                  hostname: arch_internal

    - name: arch_prompt_gateway_listener
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
@ -621,7 +619,7 @@ static_resources:
                  hostname: arch_prompt_gateway_listener

    - name: arch_listener_llm
-      connect_timeout: 5s
+      connect_timeout: 0.5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
--- a/arch/tools/cli/config_generator.py
+++ b/arch/tools/cli/config_generator.py
@ -104,7 +104,27 @@ def validate_and_render_schema():
    arch_config_string = yaml.dump(config_yaml)
    arch_llm_config_string = yaml.dump(config_yaml)

+    prompt_gateway_listener = config_yaml.get("listeners", {}).get(
+        "ingress_traffic", {}
+    )
+    if prompt_gateway_listener.get("port") == None:
+        prompt_gateway_listener["port"] = 10000  # default port for prompt gateway
+    if prompt_gateway_listener.get("address") == None:
+        prompt_gateway_listener["address"] = "127.0.0.1"
+    if prompt_gateway_listener.get("timeout") == None:
+        prompt_gateway_listener["timeout"] = "10s"
+
+    llm_gateway_listener = config_yaml.get("listeners", {}).get("egress_traffic", {})
+    if llm_gateway_listener.get("port") == None:
+        llm_gateway_listener["port"] = 12000  # default port for llm gateway
+    if llm_gateway_listener.get("address") == None:
+        llm_gateway_listener["address"] = "127.0.0.1"
+    if llm_gateway_listener.get("timeout") == None:
+        llm_gateway_listener["timeout"] = "10s"
+
    data = {
+        "prompt_gateway_listener": prompt_gateway_listener,
+        "llm_gateway_listener": llm_gateway_listener,
        "arch_config": arch_config_string,
        "arch_llm_config": arch_llm_config_string,
        "arch_clusters": inferred_clusters,
--- a/arch/tools/cli/core.py
+++ b/arch/tools/cli/core.py
@ -2,6 +2,8 @@ import subprocess
 import os
 import time
 import sys
+
+import yaml
 from cli.utils import getLogger
 from cli.consts import (
    ARCHGW_DOCKER_NAME,
@ -22,6 +24,29 @@ from cli.docker_cli import (
 log = getLogger(__name__)


+def _get_gateway_ports(arch_config_file: str) -> tuple:
+    PROMPT_GATEWAY_DEFAULT_PORT = 10000
+    LLM_GATEWAY_DEFAULT_PORT = 12000
+
+    # parse arch_config_file yaml file and get prompt_gateway_port
+    arch_config_dict = {}
+    with open(arch_config_file) as f:
+        arch_config_dict = yaml.safe_load(f)
+
+    prompt_gateway_port = (
+        arch_config_dict.get("listeners", {})
+        .get("ingress_traffic", {})
+        .get("port", PROMPT_GATEWAY_DEFAULT_PORT)
+    )
+    llm_gateway_port = (
+        arch_config_dict.get("listeners", {})
+        .get("egress_traffic", {})
+        .get("port", LLM_GATEWAY_DEFAULT_PORT)
+    )
+
+    return prompt_gateway_port, llm_gateway_port
+
+
 def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
    """
    Start Docker Compose in detached mode and stream logs until services are healthy.
@ -39,8 +64,14 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
            docker_stop_container(ARCHGW_DOCKER_NAME)
            docker_remove_container(ARCHGW_DOCKER_NAME)

+        prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file)
+
        return_code, _, archgw_stderr = docker_start_archgw_detached(
-            arch_config_file, os.path.expanduser("~/archgw_logs"), env
+            arch_config_file,
+            os.path.expanduser("~/archgw_logs"),
+            env,
+            prompt_gateway_port,
+            llm_gateway_port,
        )
        if return_code != 0:
            log.info("Failed to start arch gateway: " + str(return_code))
@ -50,7 +81,7 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
        start_time = time.time()
        while True:
            health_check_status = health_check_endpoint(
-                "http://localhost:10000/healthz"
+                f"http://localhost:{prompt_gateway_port}/healthz"
            )
            archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
            current_time = time.time()
--- a/arch/tools/cli/docker_cli.py
+++ b/arch/tools/cli/docker_cli.py
@ -1,7 +1,7 @@
 import subprocess
 import json
 import sys
-import requests  # Add this import
+import requests

 from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
 from cli.utils import getLogger
@ -33,11 +33,19 @@ def docker_remove_container(container: str) -> str:


 def docker_start_archgw_detached(
-    arch_config_file: str, logs_path_abs: str, env: dict
+    arch_config_file: str,
+    logs_path_abs: str,
+    env: dict,
+    prompt_gateway_port,
+    llm_gateway_port,
 ) -> str:
    env_args = [item for key, value in env.items() for item in ["-e", f"{key}={value}"]]

-    port_mappings = ["10000:10000", "12000:12000", "9901:19901"]
+    port_mappings = [
+        f"{prompt_gateway_port}:{prompt_gateway_port}",
+        f"{llm_gateway_port}:{llm_gateway_port}",
+        "9901:19901",
+    ]
    port_mappings_args = [item for port in port_mappings for item in ("-p", port)]

    volume_mappings = [
--- a/arch/tools/poetry.lock
+++ b/arch/tools/poetry.lock
@ -318,6 +318,28 @@ files = [
    {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
 ]

+[[package]]
+name = "docker"
+version = "7.1.0"
+description = "A Python library for the Docker Engine API."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"},
+    {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"},
+]
+
+[package.dependencies]
+pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""}
+requests = ">=2.26.0"
+urllib3 = ">=1.26.0"
+
+[package.extras]
+dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"]
+docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
+ssh = ["paramiko (>=2.4.3)"]
+websockets = ["websocket-client (>=1.3.0)"]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@ -1602,6 +1624,20 @@ files = [
 [package.dependencies]
 six = ">=1.5"

+[[package]]
+name = "python-dotenv"
+version = "1.0.1"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"},
+    {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
 [[package]]
 name = "pytz"
 version = "2025.1"
@ -1613,6 +1649,33 @@ files = [
    {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
 ]

+[[package]]
+name = "pywin32"
+version = "308"
+description = "Python for Window Extensions"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"},
+    {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"},
+    {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"},
+    {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"},
+    {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"},
+    {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"},
+    {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"},
+    {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"},
+    {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"},
+    {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"},
+    {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"},
+    {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"},
+    {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"},
+    {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"},
+    {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"},
+    {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"},
+    {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"},
+    {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"},
+]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.2"
@ -2481,4 +2544,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "d4fb144073d4f8abcd8972892545d8ce47692d303ffa63dfe9b9bcdd0aea96f2"
+content-hash = "d02e43f0884294d48736e1b8df248f47af480baffcbb7a0194da4e16cc1ea502"
--- a/arch/tools/pyproject.toml
+++ b/arch/tools/pyproject.toml
@ -15,6 +15,9 @@ click = "^8.1.7"
 jinja2 = "^3.1.4"
 jsonschema = "^4.23.0"
 setuptools = "75.5.0"
+docker = "^7.1.0"
+python-dotenv = "^1.0.1"
+pyyaml = "^6.0.2"

 [tool.poetry.scripts]
 archgw = "cli.main:main"
--- a/arch/validate_arch_config.sh
+++ b/arch/validate_arch_config.sh
@ -0,0 +1,28 @
+#!/bin/bash
+#
+# Validate every arch_config.yaml / arch_config_full_reference.yaml found under
+# the current directory by mounting it into the katanemo/archgw image and
+# running config_generator.py against it. Exits 1 if any file fails.
+
+failed_files=()
+
+# NUL-delimited find output keeps paths with spaces or glob characters intact;
+# iterating over $(find ...) would word-split and glob-expand them.
+while IFS= read -r -d '' file; do
+  echo "Validating $file..."
+  # stderr is pointed at the script's stdout before stdout is discarded, so the
+  # container's error output stays visible while its normal output is dropped.
+  if ! docker run --rm -v "$(pwd)/$file:/app/arch_config.yaml:ro" --entrypoint /bin/sh katanemo/archgw:latest -c "python config_generator.py" 2>&1 > /dev/null ; then
+    echo "Validation failed for $file"
+    failed_files+=("$file")
+  fi
+done < <(find . \( -name arch_config.yaml -o -name arch_config_full_reference.yaml \) -print0)
+
+# Print summary of failed files
+if [ ${#failed_files[@]} -ne 0 ]; then
+  echo -e "\nValidation failed for the following files:"
+  printf '%s\n' "${failed_files[@]}"
+  exit 1
+else
+  echo -e "\nAll files validated successfully!"
+fi
--- a/crates/common/src/configuration.rs
+++ b/crates/common/src/configuration.rs
@ -9,7 +9,6 @@ use crate::api::open_ai::{
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Configuration {
    pub version: String,
-    pub listener: Listener,
    pub endpoints: Option<HashMap<String, Endpoint>>,
    pub llm_providers: Vec<LlmProvider>,
    pub overrides: Option<Overrides>,
@ -48,32 +47,6 @@ pub struct ErrorTargetDetail {
    pub endpoint: Option<EndpointDetails>,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Listener {
-    pub address: String,
-    pub port: u16,
-    pub message_format: MessageFormat,
-    // pub connect_timeout: Option<DurationString>,
-}
-
-impl Default for Listener {
-    fn default() -> Self {
-        Listener {
-            address: "".to_string(),
-            port: 0,
-            message_format: MessageFormat::default(),
-            // connect_timeout: None,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
-pub enum MessageFormat {
-    #[serde(rename = "huggingface")]
-    #[default]
-    Huggingface,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct PromptGuards {
    pub input_guards: HashMap<GuardType, GuardOptions>,
@ -353,16 +326,6 @@ mod test {
            Some("/agent/summary".to_string())
        );

-        let error_target = config.error_target.as_ref().unwrap();
-        assert_eq!(
-            error_target.endpoint.as_ref().unwrap().name,
-            "error_target_1".to_string()
-        );
-        assert_eq!(
-            error_target.endpoint.as_ref().unwrap().path,
-            Some("/error".to_string())
-        );
-
        let tracing = config.tracing.as_ref().unwrap();
        assert_eq!(tracing.sampling_rate.unwrap(), 0.1);

--- a/crates/common/src/consts.rs
+++ b/crates/common/src/consts.rs
@ -3,7 +3,10 @@ pub const SYSTEM_ROLE: &str = "system";
 pub const USER_ROLE: &str = "user";
 pub const TOOL_ROLE: &str = "tool";
 pub const ASSISTANT_ROLE: &str = "assistant";
-pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes
+pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
+pub const DEFAULT_TARGET_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
+pub const API_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
+pub const MODEL_SERVER_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
 pub const MODEL_SERVER_NAME: &str = "model_server";
 pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
 pub const MESSAGES_KEY: &str = "messages";
--- a/crates/common/src/tracing.rs
+++ b/crates/common/src/tracing.rs
@ -166,7 +166,7 @@ impl TraceData {
                attributes: vec![Attribute {
                    key: "service.name".to_string(),
                    value: AttributeValue {
-                        string_value: Some("upstream-llm".to_string()),
+                        string_value: Some("egress_llm_traffic".to_string()),
                    },
                }],
            };
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -381,7 +381,7 @@ impl HttpContext for StreamContext {
                    Ok(traceparent) => {
                        let mut trace_data = common::tracing::TraceData::new();
                        let mut llm_span = Span::new(
-                            "upstream_llm_time".to_string(),
+                            "egress_traffic".to_string(),
                            Some(traceparent.trace_id),
                            Some(traceparent.parent_id),
                            self.request_body_sent_time.unwrap(),
--- a/crates/prompt_gateway/src/http_context.rs
+++ b/crates/prompt_gateway/src/http_context.rs
@ -6,7 +6,8 @@ use common::{
    consts::{
        ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
        ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
-        MODEL_SERVER_NAME, REQUEST_ID_HEADER, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
+        MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE,
+        TRACE_PARENT_HEADER, USER_ROLE,
    },
    errors::ServerError,
    http::{CallArgs, Client},
@ -144,7 +145,10 @@ impl HttpContext for StreamContext {
                if metadata.is_none() {
                    metadata = Some(HashMap::new());
                }
-                metadata.as_mut().unwrap().insert("optimize_context_window".to_string(), "true".to_string());
+                metadata
+                    .as_mut()
+                    .unwrap()
+                    .insert("optimize_context_window".to_string(), "true".to_string());
            }
        }

@ -170,12 +174,15 @@ impl HttpContext for StreamContext {
        debug!("sending request to model server");
        trace!("request body: {}", json_data);

+        let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string();
+
        let mut headers = vec![
            (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
            (":method", "POST"),
            (":path", "/function_calling"),
            ("content-type", "application/json"),
            (":authority", MODEL_SERVER_NAME),
+            ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
        ];

        if self.request_id.is_some() {
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -6,9 +6,9 @@ use common::api::open_ai::{
 };
 use common::configuration::{Overrides, PromptTarget, Tracing};
 use common::consts::{
-    ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME,
-    ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, MESSAGES_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE,
-    TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
+    API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME,
+    ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY,
+    REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
 };
 use common::errors::ServerError;
 use common::http::{CallArgs, Client};
@ -89,7 +89,7 @@ impl StreamContext {
            streaming_response: false,
            user_prompt: None,
            is_chat_completions_request: false,
-            overrides: overrides,
+            overrides,
            request_id: None,
            traceparent: None,
            _tracing: tracing,
@ -160,7 +160,7 @@ impl StreamContext {
                            callout_context.request_body.messages.clone(),
                        );
                        let arch_messages_json = serde_json::to_string(&params).unwrap();
-                        let timeout_str = ARCH_FC_REQUEST_TIMEOUT_MS.to_string();
+                        let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();

                        let mut headers = vec![
                            (":method", "POST"),
@ -302,6 +302,8 @@ impl StreamContext {
            }
        };

+        let timeout_str = API_REQUEST_TIMEOUT_MS.to_string();
+
        let http_method_str = http_method.to_string();
        let mut headers: HashMap<_, _> = [
            (ARCH_UPSTREAM_HOST_HEADER, endpoint_details.name.as_str()),
@ -310,6 +312,7 @@ impl StreamContext {
            (":authority", endpoint_details.name.as_str()),
            ("content-type", "application/json"),
            ("x-envoy-max-retries", "3"),
+            ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
        ]
        .into_iter()
        .collect();
--- a/crates/prompt_gateway/tests/integration.rs
+++ b/crates/prompt_gateway/tests/integration.rs
@ -81,10 +81,11 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
                (":path", "/function_calling"),
                ("content-type", "application/json"),
                (":authority", "model_server"),
+                ("x-envoy-upstream-rq-timeout-ms", "30000"),
            ]),
            None,
            None,
-            None,
+            Some(5000),
        )
        .returning(Some(1))
        .expect_log(Some(LogLevel::Trace), None)
@ -387,10 +388,11 @@ fn prompt_gateway_request_to_llm_gateway() {
                (":authority", "api_server"),
                ("x-envoy-max-retries", "3"),
                (":path", "/weather"),
+                ("x-envoy-upstream-rq-timeout-ms", "30000"),
            ]),
            Some(expected_body),
            None,
-            None,
+            Some(5000),
        )
        .returning(Some(2))
        .expect_metric_increment("active_http_calls", 1)
--- a/demos/samples_java/weather_forcecast_service/arch_config.yaml
+++ b/demos/samples_java/weather_forcecast_service/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 10000 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
--- a/demos/samples_python/currency_exchange/arch_config.yaml
+++ b/demos/samples_python/currency_exchange/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o
--- a/demos/samples_python/human_resources_agent/arch_config.yaml
+++ b/demos/samples_python/human_resources_agent/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
--- a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
+++ b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 127.0.0.1
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 endpoints:
  rag_energy_source_agent:
--- a/demos/samples_python/network_switch_operator_agent/arch_config.yaml
+++ b/demos/samples_python/network_switch_operator_agent/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
--- a/demos/samples_python/stock_quote/arch_config.yaml
+++ b/demos/samples_python/stock_quote/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o
--- a/demos/samples_python/weather_forecast/arch_config.yaml
+++ b/demos/samples_python/weather_forecast/arch_config.yaml
@ -1,10 +1,11 @@
 version: "0.1-beta"

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 endpoints:
  weather_forecast_service:
--- a/demos/use_cases/llm_routing/arch_config.yaml
+++ b/demos/use_cases/llm_routing/arch_config.yaml
@ -1,10 +1,11 @@
 version: "0.1-beta"

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 30s

 llm_providers:
  - name: gpt-4o-mini
--- a/demos/use_cases/ollama/arch_config.yaml
+++ b/demos/use_cases/ollama/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 llm_providers:

--- a/demos/use_cases/spotify_bearer_auth/arch_config.yaml
+++ b/demos/use_cases/spotify_bearer_auth/arch_config.yaml
@ -1,8 +1,10 @@
 version: v0.1
-listener:
-  address: 127.0.0.1
-  port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 overrides:
  optimize_context_window: true
--- a/docs/source/concepts/includes/arch_config.yaml
+++ b/docs/source/concepts/includes/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0 # or 127.0.0.1
-  port: 10000
-  # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
@ -13,7 +14,6 @@ llm_providers:
    access_key: $OPENAI_API_KEY
    model: gpt-4o
    default: true
-    stream: true

 # default system prompt used by all prompt targets
 system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions.
@ -52,11 +52,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
  app_server:
--- a/docs/source/get_started/quickstart.rst
+++ b/docs/source/get_started/quickstart.rst
@ -42,11 +42,12 @@ Create ``arch_config.yaml`` file with the following content:

   version: v0.1

-   listener:
-     address: 0.0.0.0
-     port: 10000
-     message_format: huggingface
-     connect_timeout: 0.005s
+   listeners:
+     ingress_traffic:
+       address: 0.0.0.0
+       port: 10000
+       message_format: openai
+       timeout: 30s

   llm_providers:
     - name: gpt-4o
@ -144,22 +145,23 @@ Create ``arch_config.yaml`` file with the following content:

   version: v0.1

-   listener:
-     address: 0.0.0.0
-     port: 10000
-     message_format: huggingface
-     connect_timeout: 0.005s
+   listeners:
+     egress_traffic:
+       address: 0.0.0.0
+       port: 12000
+       message_format: openai
+       timeout: 30s

   llm_providers:
     - name: gpt-4o
       access_key: $OPENAI_API_KEY
-       provider: openai
+       provider_interface: openai
       model: gpt-4o
       default: true

     - name: ministral-3b
       access_key: $MISTRAL_API_KEY
-       provider: mistral
+       provider_interface: openai
       model: ministral-3b-latest

 Step 2. Start arch gateway
--- a/docs/source/guides/includes/arch_config.yaml
+++ b/docs/source/guides/includes/arch_config.yaml
@ -1,10 +1,11 @@
 version: v0.1

-listener:
-  address: 0.0.0.0 # or 127.0.0.1
-  port: 10000
-  # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
-  message_format: huggingface
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
 llm_providers:
@ -13,7 +14,6 @@ llm_providers:
    access_key: $OPENAI_API_KEY
    model: gpt-4o
    default: true
-    stream: true

 # default system prompt used by all prompt targets
 system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions.
@ -54,11 +54,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
  app_server:
--- a/docs/source/resources/includes/arch_config_full_reference.yaml
+++ b/docs/source/resources/includes/arch_config_full_reference.yaml
@ -1,16 +1,16 @@
 version: v0.1

-listener:
-  address: 0.0.0.0 # or 127.0.0.1
-  port: 10000
-  # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
-  message_format: huggingface
-  common_tls_context: # If you configure port 443, you'll need to update the listener with your TLS certificates
-    tls_certificates:
-      - certificate_chain:
-          filename: /etc/certs/cert.pem
-        private_key:
-          filename: /etc/certs/key.pem
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 5s
+  egress_traffic:
+    address: 0.0.0.0
+    port: 12000
+    message_format: openai
+    timeout: 5s

 # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
 endpoints:
@ -35,15 +35,6 @@ llm_providers:
    access_key: $OPENAI_API_KEY
    model: gpt-4o
    default: true
-    stream: true
-    rate_limits:
-      selector: #optional headers, to add rate limiting based on http headers like JWT tokens or API keys
-        http_header:
-          name: Authorization
-          value: "" # Empty value means each separate value has a separate limit
-      limit:
-        tokens: 100000 # Tokens per unit
-        unit: minute

  - name: Mistral8x7b
    provider_interface: openai
@ -99,11 +90,6 @@ prompt_targets:
        default: false
        enum: [true, false]

-error_target:
-  endpoint:
-    name: error_target_1
-    path: /error
-
 tracing:
  # sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
  sampling_rate: 0.1
--- a/tests/archgw/arch_config.yaml
+++ b/tests/archgw/arch_config.yaml
@ -1,10 +1,11 @@
 version: "0.1-beta"

-listener:
-  address: 0.0.0.0
-  port: 10000
-  message_format: huggingface
-  connect_timeout: 0.005s
+listeners:
+  ingress_traffic:
+    address: 0.0.0.0
+    port: 10000
+    message_format: openai
+    timeout: 30s

 endpoints:
  weather_forecast_service: