diff --git a/.github/workflows/validate_arch_config.yml b/.github/workflows/validate_arch_config.yml new file mode 100644 index 00000000..9503dad2 --- /dev/null +++ b/.github/workflows/validate_arch_config.yml @@ -0,0 +1,31 @@ +name: arch config tests + +on: + push: + branches: + - main + pull_request: + +jobs: + validate_arch_config: + runs-on: ubuntu-latest + defaults: + run: + working-directory: . + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: build arch docker image + run: | + docker build -f arch/Dockerfile . -t katanemo/archgw + + - name: validate arch config + run: | + bash arch/validate_arch_config.sh diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml index 1b32b730..1432c0b9 100644 --- a/arch/arch_config_schema.yaml +++ b/arch/arch_config_schema.yaml @@ -3,21 +3,38 @@ type: object properties: version: type: string - listener: + listeners: type: object - properties: - address: - type: string - port: - type: integer - message_format: - type: string - connect_timeout: - type: string additionalProperties: false - required: - - address - - port + properties: + ingress_traffic: + type: object + properties: + address: + type: string + port: + type: integer + message_format: + type: string + enum: + - openai + timeout: + type: string + additionalProperties: false + egress_traffic: + type: object + properties: + address: + type: string + port: + type: integer + message_format: + type: string + enum: + - openai + timeout: + type: string + additionalProperties: false endpoints: type: object patternProperties: @@ -107,7 +124,10 @@ properties: required: type: boolean default: - type: string + anyOf: + - type: string + - type: integer + - type: boolean description: type: string type: @@ -115,7 +135,10 @@ properties: enum: type: array items: - type: string + anyOf: + - type: string + - type: integer + - type: boolean in_path: type: 
boolean format: @@ -224,5 +247,4 @@ properties: additionalProperties: false required: - version - - listener - llm_providers diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index 0040b57b..ca722b7c 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -29,11 +29,11 @@ stats_config: - 180000 static_resources: listeners: - - name: arch_listener_http + - name: ingress_traffic address: socket_address: - address: 0.0.0.0 - port_value: 10000 + address: {{ prompt_gateway_listener.address }} + port_value: {{ prompt_gateway_listener.port }} traffic_direction: INBOUND filter_chains: - filters: @@ -55,7 +55,7 @@ static_resources: random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: arch_listener_http + stat_prefix: ingress_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -76,13 +76,13 @@ static_resources: route: auto_host_rewrite: true cluster: arch_prompt_gateway_listener - timeout: 60s + timeout: {{ prompt_gateway_listener.timeout }} http_filters: - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: arch_prompt_gateway_listener + - name: ingress_traffic_prompt address: socket_address: address: 0.0.0.0 @@ -104,11 +104,11 @@ static_resources: envoy_grpc: cluster_name: opentelemetry_collector timeout: 0.250s - service_name: prompt_processor + service_name: ingress_traffic random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: arch_prompt_gateway_listener + stat_prefix: ingress_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -201,7 +201,7 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - name: arch_internal + - name: egress_api_traffic address: socket_address: address: 0.0.0.0 @@ -223,11 +223,11 @@ static_resources: envoy_grpc: cluster_name: opentelemetry_collector 
timeout: 0.250s - service_name: prompt_processor + service_name: egress_api_traffic random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: arch_internal + stat_prefix: egress_api_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -273,13 +273,12 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - - name: arch_listener_http_llm + - name: egress_traffic address: socket_address: - address: 0.0.0.0 - port_value: 12000 - traffic_direction: INBOUND + address: {{ llm_gateway_listener.address }} + port_value: {{ llm_gateway_listener.port }} + traffic_direction: OUTBOUND filter_chains: - filters: - name: envoy.filters.network.http_connection_manager @@ -300,7 +299,7 @@ static_resources: random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: arch_listener_http + stat_prefix: egress_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -321,14 +320,13 @@ static_resources: route: auto_host_rewrite: true cluster: arch_listener_llm - timeout: 60s + timeout: {{ llm_gateway_listener.timeout }} http_filters: - name: envoy.filters.http.router typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - - name: arch_listener_llm + - name: egress_traffic_llm address: socket_address: address: 0.0.0.0 @@ -349,11 +347,11 @@ static_resources: envoy_grpc: cluster_name: opentelemetry_collector timeout: 0.250s - service_name: llm_gateway + service_name: egress_traffic_llm random_sampling: value: {{ arch_tracing.random_sampling }} {% endif %} - stat_prefix: arch_listener_http + stat_prefix: egress_traffic codec_type: AUTO scheme_header_transformation: scheme_to_overwrite: https @@ -443,7 +441,7 @@ static_resources: clusters: - name: openai - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -467,7 +465,7 @@ 
static_resources: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - name: mistral - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -488,7 +486,7 @@ static_resources: sni: api.mistral.ai {% for internal_cluster in ["arch_fc", "model_server"] %} - name: {{ internal_cluster }} - connect_timeout: 5s + connect_timeout: 0.5s type: STRICT_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -504,7 +502,7 @@ static_resources: hostname: {{ internal_cluster }} {% endfor %} - name: mistral_7b_instruct - connect_timeout: 5s + connect_timeout: 0.5s type: STRICT_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -523,7 +521,7 @@ static_resources: {% if cluster.connect_timeout -%} connect_timeout: {{ cluster.connect_timeout }} {% else -%} - connect_timeout: 5s + connect_timeout: 0.5s {% endif -%} type: LOGICAL_DNS dns_lookup_family: V4_ONLY @@ -557,7 +555,7 @@ static_resources: {% for local_llm_provider in local_llms %} - name: {{ local_llm_provider.name }} - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -589,7 +587,7 @@ static_resources: {% endfor %} - name: arch_internal - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -605,7 +603,7 @@ static_resources: hostname: arch_internal - name: arch_prompt_gateway_listener - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN @@ -621,7 +619,7 @@ static_resources: hostname: arch_prompt_gateway_listener - name: arch_listener_llm - connect_timeout: 5s + connect_timeout: 0.5s type: LOGICAL_DNS dns_lookup_family: V4_ONLY lb_policy: ROUND_ROBIN diff --git a/arch/tools/cli/config_generator.py b/arch/tools/cli/config_generator.py index 447585fb..7392849e 100644 --- a/arch/tools/cli/config_generator.py +++ b/arch/tools/cli/config_generator.py @@ 
-104,7 +104,27 @@ def validate_and_render_schema(): arch_config_string = yaml.dump(config_yaml) arch_llm_config_string = yaml.dump(config_yaml) + prompt_gateway_listener = config_yaml.get("listeners", {}).get( + "ingress_traffic", {} + ) + if prompt_gateway_listener.get("port") is None: + prompt_gateway_listener["port"] = 10000 # default port for prompt gateway + if prompt_gateway_listener.get("address") is None: + prompt_gateway_listener["address"] = "127.0.0.1" + if prompt_gateway_listener.get("timeout") is None: + prompt_gateway_listener["timeout"] = "10s" + + llm_gateway_listener = config_yaml.get("listeners", {}).get("egress_traffic", {}) + if llm_gateway_listener.get("port") is None: + llm_gateway_listener["port"] = 12000 # default port for llm gateway + if llm_gateway_listener.get("address") is None: + llm_gateway_listener["address"] = "127.0.0.1" + if llm_gateway_listener.get("timeout") is None: + llm_gateway_listener["timeout"] = "10s" + data = { + "prompt_gateway_listener": prompt_gateway_listener, + "llm_gateway_listener": llm_gateway_listener, "arch_config": arch_config_string, "arch_llm_config": arch_llm_config_string, "arch_clusters": inferred_clusters, diff --git a/arch/tools/cli/core.py b/arch/tools/cli/core.py index 78d9499c..291ca808 100644 --- a/arch/tools/cli/core.py +++ b/arch/tools/cli/core.py @@ -2,6 +2,8 @@ import subprocess import os import time import sys + +import yaml from cli.utils import getLogger from cli.consts import ( ARCHGW_DOCKER_NAME, @@ -22,6 +24,29 @@ from cli.docker_cli import ( log = getLogger(__name__) +def _get_gateway_ports(arch_config_file: str) -> tuple: + PROMPT_GATEWAY_DEFAULT_PORT = 10000 + LLM_GATEWAY_DEFAULT_PORT = 12000 + + # parse arch_config_file yaml file and get prompt_gateway_port + arch_config_dict = {} + with open(arch_config_file) as f: + arch_config_dict = yaml.safe_load(f) + + prompt_gateway_port = ( + arch_config_dict.get("listeners", {}) + .get("ingress_traffic", {}) + .get("port", 
PROMPT_GATEWAY_DEFAULT_PORT) + ) + llm_gateway_port = ( + arch_config_dict.get("listeners", {}) + .get("egress_traffic", {}) + .get("port", LLM_GATEWAY_DEFAULT_PORT) + ) + + return prompt_gateway_port, llm_gateway_port + + def start_arch(arch_config_file, env, log_timeout=120, foreground=False): """ Start Docker Compose in detached mode and stream logs until services are healthy. @@ -39,8 +64,14 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False): docker_stop_container(ARCHGW_DOCKER_NAME) docker_remove_container(ARCHGW_DOCKER_NAME) + prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file) + return_code, _, archgw_stderr = docker_start_archgw_detached( - arch_config_file, os.path.expanduser("~/archgw_logs"), env + arch_config_file, + os.path.expanduser("~/archgw_logs"), + env, + prompt_gateway_port, + llm_gateway_port, ) if return_code != 0: log.info("Failed to start arch gateway: " + str(return_code)) @@ -50,7 +81,7 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False): start_time = time.time() while True: health_check_status = health_check_endpoint( - "http://localhost:10000/healthz" + f"http://localhost:{prompt_gateway_port}/healthz" ) archgw_status = docker_container_status(ARCHGW_DOCKER_NAME) current_time = time.time() diff --git a/arch/tools/cli/docker_cli.py b/arch/tools/cli/docker_cli.py index 2ef9eb5f..edb8f764 100644 --- a/arch/tools/cli/docker_cli.py +++ b/arch/tools/cli/docker_cli.py @@ -1,7 +1,7 @@ import subprocess import json import sys -import requests # Add this import +import requests from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME from cli.utils import getLogger @@ -33,11 +33,19 @@ def docker_remove_container(container: str) -> str: def docker_start_archgw_detached( - arch_config_file: str, logs_path_abs: str, env: dict + arch_config_file: str, + logs_path_abs: str, + env: dict, + prompt_gateway_port, + llm_gateway_port, ) -> str: env_args = [item for key, value in 
env.items() for item in ["-e", f"{key}={value}"]] - port_mappings = ["10000:10000", "12000:12000", "9901:19901"] + port_mappings = [ + f"{prompt_gateway_port}:{prompt_gateway_port}", + f"{llm_gateway_port}:{llm_gateway_port}", + "9901:19901", + ] port_mappings_args = [item for port in port_mappings for item in ("-p", port)] volume_mappings = [ diff --git a/arch/tools/poetry.lock b/arch/tools/poetry.lock index e04c6db3..0ccc45d3 100644 --- a/arch/tools/poetry.lock +++ b/arch/tools/poetry.lock @@ -318,6 +318,28 @@ files = [ {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, ] +[[package]] +name = "docker" +version = "7.1.0" +description = "A Python library for the Docker Engine API." +optional = false +python-versions = ">=3.8" +files = [ + {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, + {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, +] + +[package.dependencies] +pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""} +requests = ">=2.26.0" +urllib3 = ">=1.26.0" + +[package.extras] +dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"] +docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] +ssh = ["paramiko (>=2.4.3)"] +websockets = ["websocket-client (>=1.3.0)"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1602,6 +1624,20 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = 
"sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2025.1" @@ -1613,6 +1649,33 @@ files = [ {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, ] +[[package]] +name = "pywin32" +version = "308" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, + {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, + {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"}, + {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"}, + {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"}, + {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"}, + {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"}, + {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"}, + {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"}, + {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"}, + {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"}, + {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = 
"sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"}, + {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"}, + {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"}, + {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"}, + {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"}, + {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"}, + {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"}, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -2481,4 +2544,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d4fb144073d4f8abcd8972892545d8ce47692d303ffa63dfe9b9bcdd0aea96f2" +content-hash = "d02e43f0884294d48736e1b8df248f47af480baffcbb7a0194da4e16cc1ea502" diff --git a/arch/tools/pyproject.toml b/arch/tools/pyproject.toml index c32ea929..8ce28cbc 100644 --- a/arch/tools/pyproject.toml +++ b/arch/tools/pyproject.toml @@ -15,6 +15,9 @@ click = "^8.1.7" jinja2 = "^3.1.4" jsonschema = "^4.23.0" setuptools = "75.5.0" +docker = "^7.1.0" +python-dotenv = "^1.0.1" +pyyaml = "^6.0.2" [tool.poetry.scripts] archgw = "cli.main:main" diff --git a/arch/validate_arch_config.sh b/arch/validate_arch_config.sh new file mode 100644 index 00000000..a3822e90 --- /dev/null +++ b/arch/validate_arch_config.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +failed_files=() + +for file in $(find . -name arch_config.yaml -o -name arch_config_full_reference.yaml); do + echo "Validating $file..." + if ! 
docker run --rm -v "$(pwd)/$file:/app/arch_config.yaml:ro" --entrypoint /bin/sh katanemo/archgw:latest -c "python config_generator.py" > /dev/null 2>&1 ; then + echo "Validation failed for $file" + failed_files+=("$file") + fi +done + +# Print summary of failed files +if [ ${#failed_files[@]} -ne 0 ]; then + echo -e "\nValidation failed for the following files:" + printf '%s\n' "${failed_files[@]}" + exit 1 +else + echo -e "\nAll files validated successfully!" +fi diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 069695ba..1b954c3d 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -9,7 +9,6 @@ use crate::api::open_ai::{ #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Configuration { pub version: String, - pub listener: Listener, pub endpoints: Option>, pub llm_providers: Vec, pub overrides: Option, @@ -48,32 +47,6 @@ pub struct ErrorTargetDetail { pub endpoint: Option, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Listener { - pub address: String, - pub port: u16, - pub message_format: MessageFormat, - // pub connect_timeout: Option, -} - -impl Default for Listener { - fn default() -> Self { - Listener { - address: "".to_string(), - port: 0, - message_format: MessageFormat::default(), - // connect_timeout: None, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub enum MessageFormat { - #[serde(rename = "huggingface")] - #[default] - Huggingface, -} - #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct PromptGuards { - pub input_guards: HashMap, @@ -353,16 +326,6 @@ mod test { Some("/agent/summary".to_string()) ); - let error_target = config.error_target.as_ref().unwrap(); - assert_eq!( - error_target.endpoint.as_ref().unwrap().name, - "error_target_1".to_string() - ); - assert_eq!( - error_target.endpoint.as_ref().unwrap().path, - Some("/error".to_string()) - ); - let tracing = config.tracing.as_ref().unwrap(); 
assert_eq!(tracing.sampling_rate.unwrap(), 0.1); diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 561dbae3..cd52220e 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -3,7 +3,10 @@ pub const SYSTEM_ROLE: &str = "system"; pub const USER_ROLE: &str = "user"; pub const TOOL_ROLE: &str = "tool"; pub const ASSISTANT_ROLE: &str = "assistant"; -pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes +pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds +pub const DEFAULT_TARGET_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds +pub const API_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds +pub const MODEL_SERVER_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds pub const MODEL_SERVER_NAME: &str = "model_server"; pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider"; pub const MESSAGES_KEY: &str = "messages"; diff --git a/crates/common/src/tracing.rs b/crates/common/src/tracing.rs index 85a95b0f..363a0870 100644 --- a/crates/common/src/tracing.rs +++ b/crates/common/src/tracing.rs @@ -166,7 +166,7 @@ impl TraceData { attributes: vec![Attribute { key: "service.name".to_string(), value: AttributeValue { - string_value: Some("upstream-llm".to_string()), + string_value: Some("egress_llm_traffic".to_string()), }, }], }; diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 528358a3..69496a61 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -381,7 +381,7 @@ impl HttpContext for StreamContext { Ok(traceparent) => { let mut trace_data = common::tracing::TraceData::new(); let mut llm_span = Span::new( - "upstream_llm_time".to_string(), + "egress_traffic".to_string(), Some(traceparent.trace_id), Some(traceparent.parent_id), self.request_body_sent_time.unwrap(), diff --git a/crates/prompt_gateway/src/http_context.rs b/crates/prompt_gateway/src/http_context.rs index 1ff7f91d..53a2d25b 100644 --- 
a/crates/prompt_gateway/src/http_context.rs +++ b/crates/prompt_gateway/src/http_context.rs @@ -6,7 +6,8 @@ use common::{ consts::{ ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH, - MODEL_SERVER_NAME, REQUEST_ID_HEADER, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE, + MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE, + TRACE_PARENT_HEADER, USER_ROLE, }, errors::ServerError, http::{CallArgs, Client}, @@ -144,7 +145,10 @@ impl HttpContext for StreamContext { if metadata.is_none() { metadata = Some(HashMap::new()); } - metadata.as_mut().unwrap().insert("optimize_context_window".to_string(), "true".to_string()); + metadata + .as_mut() + .unwrap() + .insert("optimize_context_window".to_string(), "true".to_string()); } } @@ -170,12 +174,15 @@ impl HttpContext for StreamContext { debug!("sending request to model server"); trace!("request body: {}", json_data); + let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string(); + let mut headers = vec![ (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), (":method", "POST"), (":path", "/function_calling"), ("content-type", "application/json"), (":authority", MODEL_SERVER_NAME), + ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()), ]; if self.request_id.is_some() { diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index e6db7f59..d197b3e0 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -6,9 +6,9 @@ use common::api::open_ai::{ }; use common::configuration::{Overrides, PromptTarget, Tracing}; use common::consts::{ - ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME, - ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, MESSAGES_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE, - TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE, + API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, 
ARCH_INTERNAL_CLUSTER_NAME, + ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY, + REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE, }; use common::errors::ServerError; use common::http::{CallArgs, Client}; @@ -89,7 +89,7 @@ impl StreamContext { streaming_response: false, user_prompt: None, is_chat_completions_request: false, - overrides: overrides, + overrides, request_id: None, traceparent: None, _tracing: tracing, @@ -160,7 +160,7 @@ impl StreamContext { callout_context.request_body.messages.clone(), ); let arch_messages_json = serde_json::to_string(¶ms).unwrap(); - let timeout_str = ARCH_FC_REQUEST_TIMEOUT_MS.to_string(); + let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string(); let mut headers = vec![ (":method", "POST"), @@ -302,6 +302,8 @@ impl StreamContext { } }; + let timeout_str = API_REQUEST_TIMEOUT_MS.to_string(); + let http_method_str = http_method.to_string(); let mut headers: HashMap<_, _> = [ (ARCH_UPSTREAM_HOST_HEADER, endpoint_details.name.as_str()), @@ -310,6 +312,7 @@ impl StreamContext { (":authority", endpoint_details.name.as_str()), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), + ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()), ] .into_iter() .collect(); diff --git a/crates/prompt_gateway/tests/integration.rs b/crates/prompt_gateway/tests/integration.rs index 4af98166..0792a319 100644 --- a/crates/prompt_gateway/tests/integration.rs +++ b/crates/prompt_gateway/tests/integration.rs @@ -81,10 +81,11 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) { (":path", "/function_calling"), ("content-type", "application/json"), (":authority", "model_server"), + ("x-envoy-upstream-rq-timeout-ms", "30000"), ]), None, None, - None, + Some(5000), ) .returning(Some(1)) .expect_log(Some(LogLevel::Trace), None) @@ -387,10 +388,11 @@ fn prompt_gateway_request_to_llm_gateway() { (":authority", "api_server"), 
("x-envoy-max-retries", "3"), (":path", "/weather"), + ("x-envoy-upstream-rq-timeout-ms", "30000"), ]), Some(expected_body), None, - None, + Some(5000), ) .returning(Some(2)) .expect_metric_increment("active_http_calls", 1) diff --git a/demos/samples_java/weather_forcecast_service/arch_config.yaml b/demos/samples_java/weather_forcecast_service/arch_config.yaml index 10c22819..0e3007e2 100644 --- a/demos/samples_java/weather_forcecast_service/arch_config.yaml +++ b/demos/samples_java/weather_forcecast_service/arch_config.yaml @@ -1,8 +1,10 @@ version: v0.1 -listener: - address: 127.0.0.1 - port: 10000 #If you configure port 443, you'll need to update the listener with tls_certificates - message_format: huggingface +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way llm_providers: diff --git a/demos/samples_python/currency_exchange/arch_config.yaml b/demos/samples_python/currency_exchange/arch_config.yaml index 89a9e65b..1475abca 100644 --- a/demos/samples_python/currency_exchange/arch_config.yaml +++ b/demos/samples_python/currency_exchange/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s llm_providers: - name: gpt-4o diff --git a/demos/samples_python/human_resources_agent/arch_config.yaml b/demos/samples_python/human_resources_agent/arch_config.yaml index 09264821..5b1a9aa2 100644 --- a/demos/samples_python/human_resources_agent/arch_config.yaml +++ b/demos/samples_python/human_resources_agent/arch_config.yaml @@ -1,8 +1,10 @@ version: v0.1 -listener: - address: 127.0.0.1 - port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates - message_format: huggingface +listeners: + 
ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way llm_providers: diff --git a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml index 1399965f..8e8feb4f 100644 --- a/demos/samples_python/multi_turn_rag_agent/arch_config.yaml +++ b/demos/samples_python/multi_turn_rag_agent/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 127.0.0.1 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s endpoints: rag_energy_source_agent: diff --git a/demos/samples_python/network_switch_operator_agent/arch_config.yaml b/demos/samples_python/network_switch_operator_agent/arch_config.yaml index ad3bfae5..40d529a2 100644 --- a/demos/samples_python/network_switch_operator_agent/arch_config.yaml +++ b/demos/samples_python/network_switch_operator_agent/arch_config.yaml @@ -1,8 +1,10 @@ version: v0.1 -listener: - address: 127.0.0.1 - port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates - message_format: huggingface +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way llm_providers: diff --git a/demos/samples_python/stock_quote/arch_config.yaml b/demos/samples_python/stock_quote/arch_config.yaml index c763d4ca..96901620 100644 --- a/demos/samples_python/stock_quote/arch_config.yaml +++ b/demos/samples_python/stock_quote/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 
30s llm_providers: - name: gpt-4o diff --git a/demos/samples_python/weather_forecast/arch_config.yaml b/demos/samples_python/weather_forecast/arch_config.yaml index 94a6bdfb..8b0f4ca0 100644 --- a/demos/samples_python/weather_forecast/arch_config.yaml +++ b/demos/samples_python/weather_forecast/arch_config.yaml @@ -1,10 +1,11 @@ version: "0.1-beta" -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s endpoints: weather_forecast_service: diff --git a/demos/use_cases/llm_routing/arch_config.yaml b/demos/use_cases/llm_routing/arch_config.yaml index f7ce78cd..e3238484 100644 --- a/demos/use_cases/llm_routing/arch_config.yaml +++ b/demos/use_cases/llm_routing/arch_config.yaml @@ -1,10 +1,11 @@ version: "0.1-beta" -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 30s llm_providers: - name: gpt-4o-mini diff --git a/demos/use_cases/ollama/arch_config.yaml b/demos/use_cases/ollama/arch_config.yaml index 5cb77750..394d3d6c 100644 --- a/demos/use_cases/ollama/arch_config.yaml +++ b/demos/use_cases/ollama/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s llm_providers: diff --git a/demos/use_cases/spotify_bearer_auth/arch_config.yaml b/demos/use_cases/spotify_bearer_auth/arch_config.yaml index a259a539..1d82a426 100644 --- a/demos/use_cases/spotify_bearer_auth/arch_config.yaml +++ b/demos/use_cases/spotify_bearer_auth/arch_config.yaml @@ -1,8 +1,10 @@ version: v0.1 -listener: - address: 127.0.0.1 - port: 8080 #If you configure port 443, you'll need to update the listener with 
tls_certificates - message_format: huggingface +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s overrides: optimize_context_window: true diff --git a/docs/source/concepts/includes/arch_config.yaml b/docs/source/concepts/includes/arch_config.yaml index c78f35f7..a7d0a289 100644 --- a/docs/source/concepts/includes/arch_config.yaml +++ b/docs/source/concepts/includes/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 0.0.0.0 # or 127.0.0.1 - port: 10000 - # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request - message_format: huggingface +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way llm_providers: @@ -13,7 +14,6 @@ llm_providers: access_key: $OPENAI_API_KEY model: gpt-4o default: true - stream: true # default system prompt used by all prompt targets system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions. @@ -52,11 +52,6 @@ prompt_targets: default: false enum: [true, false] -error_target: - endpoint: - name: error_target_1 - path: /error - # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem. 
endpoints: app_server: diff --git a/docs/source/get_started/quickstart.rst b/docs/source/get_started/quickstart.rst index 268bf45d..d73ef7ca 100644 --- a/docs/source/get_started/quickstart.rst +++ b/docs/source/get_started/quickstart.rst @@ -42,11 +42,12 @@ Create ``arch_config.yaml`` file with the following content: version: v0.1 - listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s + listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s llm_providers: - name: gpt-4o @@ -144,22 +145,23 @@ Create ``arch_config.yaml`` file with the following content: version: v0.1 - listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s + listeners: + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 30s llm_providers: - name: gpt-4o access_key: $OPENAI_API_KEY - provider: openai + provider_interface: openai model: gpt-4o default: true - name: ministral-3b access_key: $MISTRAL_API_KEY - provider: mistral + provider_interface: openai model: ministral-3b-latest Step 2. 
Start arch gateway diff --git a/docs/source/guides/includes/arch_config.yaml b/docs/source/guides/includes/arch_config.yaml index 415c74aa..e86c6072 100644 --- a/docs/source/guides/includes/arch_config.yaml +++ b/docs/source/guides/includes/arch_config.yaml @@ -1,10 +1,11 @@ version: v0.1 -listener: - address: 0.0.0.0 # or 127.0.0.1 - port: 10000 - # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request - message_format: huggingface +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s # Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way llm_providers: @@ -13,7 +14,6 @@ llm_providers: access_key: $OPENAI_API_KEY model: gpt-4o default: true - stream: true # default system prompt used by all prompt targets system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions. @@ -54,11 +54,6 @@ prompt_targets: default: false enum: [true, false] -error_target: - endpoint: - name: error_target_1 - path: /error - # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem. 
endpoints: app_server: diff --git a/docs/source/resources/includes/arch_config_full_reference.yaml b/docs/source/resources/includes/arch_config_full_reference.yaml index f21fc1f5..90bbef56 100644 --- a/docs/source/resources/includes/arch_config_full_reference.yaml +++ b/docs/source/resources/includes/arch_config_full_reference.yaml @@ -1,16 +1,16 @@ version: v0.1 -listener: - address: 0.0.0.0 # or 127.0.0.1 - port: 10000 - # Defines how Arch should parse the content from application/json or text/pain Content-type in the http request - message_format: huggingface - common_tls_context: # If you configure port 443, you'll need to update the listener with your TLS certificates - tls_certificates: - - certificate_chain: - filename: /etc/certs/cert.pem - private_key: - filename: /etc/certs/key.pem +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 5s + egress_traffic: + address: 0.0.0.0 + port: 12000 + message_format: openai + timeout: 5s # Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem. endpoints: @@ -35,15 +35,6 @@ llm_providers: access_key: $OPENAI_API_KEY model: gpt-4o default: true - stream: true - rate_limits: - selector: #optional headers, to add rate limiting based on http headers like JWT tokens or API keys - http_header: - name: Authorization - value: "" # Empty value means each separate value has a separate limit - limit: - tokens: 100000 # Tokens per unit - unit: minute - name: Mistral8x7b provider_interface: openai @@ -99,11 +90,6 @@ prompt_targets: default: false enum: [true, false] -error_target: - endpoint: - name: error_target_1 - path: /error - tracing: # sampling rate. Note by default Arch works on OpenTelemetry compatible tracing. 
sampling_rate: 0.1 diff --git a/tests/archgw/arch_config.yaml b/tests/archgw/arch_config.yaml index 2c3d85d5..d1990330 100644 --- a/tests/archgw/arch_config.yaml +++ b/tests/archgw/arch_config.yaml @@ -1,10 +1,11 @@ version: "0.1-beta" -listener: - address: 0.0.0.0 - port: 10000 - message_format: huggingface - connect_timeout: 0.005s +listeners: + ingress_traffic: + address: 0.0.0.0 + port: 10000 + message_format: openai + timeout: 30s endpoints: weather_forecast_service: