Update arch_config and add tests for arch config file (#407)

This commit is contained in:
Adil Hafeez 2025-02-14 19:28:10 -08:00 committed by GitHub
parent d0a783cca8
commit e40b13be05
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 379 additions and 212 deletions

View file

@ -0,0 +1,31 @@
name: arch config tests
on:
push:
branches:
- main
pull_request:
jobs:
validate_arch_config:
runs-on: ubuntu-latest
defaults:
run:
working-directory: .
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.12"
- name: build arch docker image
run: |
docker build -f arch/Dockerfile . -t katanemo/archgw
- name: validate arch config
run: |
bash arch/validate_arch_config.sh

View file

@ -3,21 +3,38 @@ type: object
properties:
version:
type: string
listener:
listeners:
type: object
properties:
address:
type: string
port:
type: integer
message_format:
type: string
connect_timeout:
type: string
additionalProperties: false
required:
- address
- port
properties:
ingress_traffic:
type: object
properties:
address:
type: string
port:
type: integer
message_format:
type: string
enum:
- openai
timeout:
type: string
additionalProperties: false
egress_traffic:
type: object
properties:
address:
type: string
port:
type: integer
message_format:
type: string
enum:
- openai
timeout:
type: string
additionalProperties: false
endpoints:
type: object
patternProperties:
@ -107,7 +124,10 @@ properties:
required:
type: boolean
default:
type: string
anyOf:
- type: string
- type: integer
- type: boolean
description:
type: string
type:
@ -115,7 +135,10 @@ properties:
enum:
type: array
items:
type: string
anyOf:
- type: string
- type: integer
- type: boolean
in_path:
type: boolean
format:
@ -224,5 +247,4 @@ properties:
additionalProperties: false
required:
- version
- listener
- llm_providers

View file

@ -29,11 +29,11 @@ stats_config:
- 180000
static_resources:
listeners:
- name: arch_listener_http
- name: ingress_traffic
address:
socket_address:
address: 0.0.0.0
port_value: 10000
address: {{ prompt_gateway_listener.address }}
port_value: {{ prompt_gateway_listener.port }}
traffic_direction: INBOUND
filter_chains:
- filters:
@ -55,7 +55,7 @@ static_resources:
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: ingress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -76,13 +76,13 @@ static_resources:
route:
auto_host_rewrite: true
cluster: arch_prompt_gateway_listener
timeout: 60s
timeout: {{ prompt_gateway_listener.timeout }}
http_filters:
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_prompt_gateway_listener
- name: ingress_traffic_prompt
address:
socket_address:
address: 0.0.0.0
@ -104,11 +104,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: prompt_processor
service_name: ingress_traffic
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_prompt_gateway_listener
stat_prefix: ingress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -201,7 +201,7 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_internal
- name: egress_api_traffic
address:
socket_address:
address: 0.0.0.0
@ -223,11 +223,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: prompt_processor
service_name: egress_api_traffic
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_internal
stat_prefix: egress_api_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -273,13 +273,12 @@ static_resources:
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_http_llm
- name: egress_traffic
address:
socket_address:
address: 0.0.0.0
port_value: 12000
traffic_direction: INBOUND
address: {{ llm_gateway_listener.address }}
port_value: {{ llm_gateway_listener.port }}
traffic_direction: OUTBOUND
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
@ -300,7 +299,7 @@ static_resources:
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: egress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -321,14 +320,13 @@ static_resources:
route:
auto_host_rewrite: true
cluster: arch_listener_llm
timeout: 60s
timeout: {{ llm_gateway_listener.timeout }}
http_filters:
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
- name: arch_listener_llm
- name: egress_traffic_llm
address:
socket_address:
address: 0.0.0.0
@ -349,11 +347,11 @@ static_resources:
envoy_grpc:
cluster_name: opentelemetry_collector
timeout: 0.250s
service_name: llm_gateway
service_name: egress_traffic_llm
random_sampling:
value: {{ arch_tracing.random_sampling }}
{% endif %}
stat_prefix: arch_listener_http
stat_prefix: egress_traffic
codec_type: AUTO
scheme_header_transformation:
scheme_to_overwrite: https
@ -443,7 +441,7 @@ static_resources:
clusters:
- name: openai
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -467,7 +465,7 @@ static_resources:
tls_minimum_protocol_version: TLSv1_2
tls_maximum_protocol_version: TLSv1_3
- name: mistral
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -488,7 +486,7 @@ static_resources:
sni: api.mistral.ai
{% for internal_cluster in ["arch_fc", "model_server"] %}
- name: {{ internal_cluster }}
connect_timeout: 5s
connect_timeout: 0.5s
type: STRICT_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -504,7 +502,7 @@ static_resources:
hostname: {{ internal_cluster }}
{% endfor %}
- name: mistral_7b_instruct
connect_timeout: 5s
connect_timeout: 0.5s
type: STRICT_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -523,7 +521,7 @@ static_resources:
{% if cluster.connect_timeout -%}
connect_timeout: {{ cluster.connect_timeout }}
{% else -%}
connect_timeout: 5s
connect_timeout: 0.5s
{% endif -%}
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
@ -557,7 +555,7 @@ static_resources:
{% for local_llm_provider in local_llms %}
- name: {{ local_llm_provider.name }}
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -589,7 +587,7 @@ static_resources:
{% endfor %}
- name: arch_internal
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -605,7 +603,7 @@ static_resources:
hostname: arch_internal
- name: arch_prompt_gateway_listener
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN
@ -621,7 +619,7 @@ static_resources:
hostname: arch_prompt_gateway_listener
- name: arch_listener_llm
connect_timeout: 5s
connect_timeout: 0.5s
type: LOGICAL_DNS
dns_lookup_family: V4_ONLY
lb_policy: ROUND_ROBIN

View file

@ -104,7 +104,27 @@ def validate_and_render_schema():
arch_config_string = yaml.dump(config_yaml)
arch_llm_config_string = yaml.dump(config_yaml)
prompt_gateway_listener = config_yaml.get("listeners", {}).get(
"ingress_traffic", {}
)
if prompt_gateway_listener.get("port") == None:
prompt_gateway_listener["port"] = 10000 # default port for prompt gateway
if prompt_gateway_listener.get("address") == None:
prompt_gateway_listener["address"] = "127.0.0.1"
if prompt_gateway_listener.get("timeout") == None:
prompt_gateway_listener["timeout"] = "10s"
llm_gateway_listener = config_yaml.get("listeners", {}).get("egress_traffic", {})
if llm_gateway_listener.get("port") == None:
llm_gateway_listener["port"] = 12000 # default port for llm gateway
if llm_gateway_listener.get("address") == None:
llm_gateway_listener["address"] = "127.0.0.1"
if llm_gateway_listener.get("timeout") == None:
llm_gateway_listener["timeout"] = "10s"
data = {
"prompt_gateway_listener": prompt_gateway_listener,
"llm_gateway_listener": llm_gateway_listener,
"arch_config": arch_config_string,
"arch_llm_config": arch_llm_config_string,
"arch_clusters": inferred_clusters,

View file

@ -2,6 +2,8 @@ import subprocess
import os
import time
import sys
import yaml
from cli.utils import getLogger
from cli.consts import (
ARCHGW_DOCKER_NAME,
@ -22,6 +24,29 @@ from cli.docker_cli import (
log = getLogger(__name__)
def _get_gateway_ports(arch_config_file: str) -> tuple:
PROMPT_GATEWAY_DEFAULT_PORT = 10000
LLM_GATEWAY_DEFAULT_PORT = 12000
# parse arch_config_file yaml file and get prompt_gateway_port
arch_config_dict = {}
with open(arch_config_file) as f:
arch_config_dict = yaml.safe_load(f)
prompt_gateway_port = (
arch_config_dict.get("listeners", {})
.get("ingress_traffic", {})
.get("port", PROMPT_GATEWAY_DEFAULT_PORT)
)
llm_gateway_port = (
arch_config_dict.get("listeners", {})
.get("egress_traffic", {})
.get("port", LLM_GATEWAY_DEFAULT_PORT)
)
return prompt_gateway_port, llm_gateway_port
def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
"""
Start Docker Compose in detached mode and stream logs until services are healthy.
@ -39,8 +64,14 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
docker_stop_container(ARCHGW_DOCKER_NAME)
docker_remove_container(ARCHGW_DOCKER_NAME)
prompt_gateway_port, llm_gateway_port = _get_gateway_ports(arch_config_file)
return_code, _, archgw_stderr = docker_start_archgw_detached(
arch_config_file, os.path.expanduser("~/archgw_logs"), env
arch_config_file,
os.path.expanduser("~/archgw_logs"),
env,
prompt_gateway_port,
llm_gateway_port,
)
if return_code != 0:
log.info("Failed to start arch gateway: " + str(return_code))
@ -50,7 +81,7 @@ def start_arch(arch_config_file, env, log_timeout=120, foreground=False):
start_time = time.time()
while True:
health_check_status = health_check_endpoint(
"http://localhost:10000/healthz"
f"http://localhost:{prompt_gateway_port}/healthz"
)
archgw_status = docker_container_status(ARCHGW_DOCKER_NAME)
current_time = time.time()

View file

@ -1,7 +1,7 @@
import subprocess
import json
import sys
import requests # Add this import
import requests
from cli.consts import ARCHGW_DOCKER_IMAGE, ARCHGW_DOCKER_NAME
from cli.utils import getLogger
@ -33,11 +33,19 @@ def docker_remove_container(container: str) -> str:
def docker_start_archgw_detached(
arch_config_file: str, logs_path_abs: str, env: dict
arch_config_file: str,
logs_path_abs: str,
env: dict,
prompt_gateway_port,
llm_gateway_port,
) -> str:
env_args = [item for key, value in env.items() for item in ["-e", f"{key}={value}"]]
port_mappings = ["10000:10000", "12000:12000", "9901:19901"]
port_mappings = [
f"{prompt_gateway_port}:{prompt_gateway_port}",
f"{llm_gateway_port}:{llm_gateway_port}",
"9901:19901",
]
port_mappings_args = [item for port in port_mappings for item in ("-p", port)]
volume_mappings = [

65
arch/tools/poetry.lock generated
View file

@ -318,6 +318,28 @@ files = [
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
[[package]]
name = "docker"
version = "7.1.0"
description = "A Python library for the Docker Engine API."
optional = false
python-versions = ">=3.8"
files = [
{file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"},
{file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"},
]
[package.dependencies]
pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""}
requests = ">=2.26.0"
urllib3 = ">=1.26.0"
[package.extras]
dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"]
docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"]
ssh = ["paramiko (>=2.4.3)"]
websockets = ["websocket-client (>=1.3.0)"]
[[package]]
name = "exceptiongroup"
version = "1.2.2"
@ -1602,6 +1624,20 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python-dotenv"
version = "1.0.1"
description = "Read key-value pairs from a .env file and set them as environment variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"},
{file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"},
]
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "pytz"
version = "2025.1"
@ -1613,6 +1649,33 @@ files = [
{file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
]
[[package]]
name = "pywin32"
version = "308"
description = "Python for Window Extensions"
optional = false
python-versions = "*"
files = [
{file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"},
{file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"},
{file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"},
{file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"},
{file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"},
{file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"},
{file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"},
{file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"},
{file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"},
{file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"},
{file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"},
{file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"},
{file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"},
{file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"},
{file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"},
{file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"},
{file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"},
{file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"},
]
[[package]]
name = "pyyaml"
version = "6.0.2"
@ -2481,4 +2544,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "d4fb144073d4f8abcd8972892545d8ce47692d303ffa63dfe9b9bcdd0aea96f2"
content-hash = "d02e43f0884294d48736e1b8df248f47af480baffcbb7a0194da4e16cc1ea502"

View file

@ -15,6 +15,9 @@ click = "^8.1.7"
jinja2 = "^3.1.4"
jsonschema = "^4.23.0"
setuptools = "75.5.0"
docker = "^7.1.0"
python-dotenv = "^1.0.1"
pyyaml = "^6.0.2"
[tool.poetry.scripts]
archgw = "cli.main:main"

View file

@ -0,0 +1,20 @@
#!/bin/bash
failed_files=()
for file in $(find . -name arch_config.yaml -o -name arch_config_full_reference.yaml); do
echo "Validating $file..."
if ! docker run --rm -v "$(pwd)/$file:/app/arch_config.yaml:ro" --entrypoint /bin/sh katanemo/archgw:latest -c "python config_generator.py" 2>&1 > /dev/null ; then
echo "Validation failed for $file"
failed_files+=("$file")
fi
done
# Print summary of failed files
if [ ${#failed_files[@]} -ne 0 ]; then
echo -e "\nValidation failed for the following files:"
printf '%s\n' "${failed_files[@]}"
exit 1
else
echo -e "\nAll files validated successfully!"
fi

View file

@ -9,7 +9,6 @@ use crate::api::open_ai::{
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Configuration {
pub version: String,
pub listener: Listener,
pub endpoints: Option<HashMap<String, Endpoint>>,
pub llm_providers: Vec<LlmProvider>,
pub overrides: Option<Overrides>,
@ -48,32 +47,6 @@ pub struct ErrorTargetDetail {
pub endpoint: Option<EndpointDetails>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Listener {
pub address: String,
pub port: u16,
pub message_format: MessageFormat,
// pub connect_timeout: Option<DurationString>,
}
impl Default for Listener {
fn default() -> Self {
Listener {
address: "".to_string(),
port: 0,
message_format: MessageFormat::default(),
// connect_timeout: None,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub enum MessageFormat {
#[serde(rename = "huggingface")]
#[default]
Huggingface,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PromptGuards {
pub input_guards: HashMap<GuardType, GuardOptions>,
@ -353,16 +326,6 @@ mod test {
Some("/agent/summary".to_string())
);
let error_target = config.error_target.as_ref().unwrap();
assert_eq!(
error_target.endpoint.as_ref().unwrap().name,
"error_target_1".to_string()
);
assert_eq!(
error_target.endpoint.as_ref().unwrap().path,
Some("/error".to_string())
);
let tracing = config.tracing.as_ref().unwrap();
assert_eq!(tracing.sampling_rate.unwrap(), 0.1);

View file

@ -3,7 +3,10 @@ pub const SYSTEM_ROLE: &str = "system";
pub const USER_ROLE: &str = "user";
pub const TOOL_ROLE: &str = "tool";
pub const ASSISTANT_ROLE: &str = "assistant";
pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes
pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
pub const DEFAULT_TARGET_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
pub const API_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
pub const MODEL_SERVER_REQUEST_TIMEOUT_MS: u64 = 30000; // 30 seconds
pub const MODEL_SERVER_NAME: &str = "model_server";
pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider";
pub const MESSAGES_KEY: &str = "messages";

View file

@ -166,7 +166,7 @@ impl TraceData {
attributes: vec![Attribute {
key: "service.name".to_string(),
value: AttributeValue {
string_value: Some("upstream-llm".to_string()),
string_value: Some("egress_llm_traffic".to_string()),
},
}],
};

View file

@ -381,7 +381,7 @@ impl HttpContext for StreamContext {
Ok(traceparent) => {
let mut trace_data = common::tracing::TraceData::new();
let mut llm_span = Span::new(
"upstream_llm_time".to_string(),
"egress_traffic".to_string(),
Some(traceparent.trace_id),
Some(traceparent.parent_id),
self.request_body_sent_time.unwrap(),

View file

@ -6,7 +6,8 @@ use common::{
consts::{
ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME, ARCH_STATE_HEADER,
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, CHAT_COMPLETIONS_PATH, HEALTHZ_PATH,
MODEL_SERVER_NAME, REQUEST_ID_HEADER, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
MODEL_SERVER_NAME, MODEL_SERVER_REQUEST_TIMEOUT_MS, REQUEST_ID_HEADER, TOOL_ROLE,
TRACE_PARENT_HEADER, USER_ROLE,
},
errors::ServerError,
http::{CallArgs, Client},
@ -144,7 +145,10 @@ impl HttpContext for StreamContext {
if metadata.is_none() {
metadata = Some(HashMap::new());
}
metadata.as_mut().unwrap().insert("optimize_context_window".to_string(), "true".to_string());
metadata
.as_mut()
.unwrap()
.insert("optimize_context_window".to_string(), "true".to_string());
}
}
@ -170,12 +174,15 @@ impl HttpContext for StreamContext {
debug!("sending request to model server");
trace!("request body: {}", json_data);
let timeout_str = MODEL_SERVER_REQUEST_TIMEOUT_MS.to_string();
let mut headers = vec![
(ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME),
(":method", "POST"),
(":path", "/function_calling"),
("content-type", "application/json"),
(":authority", MODEL_SERVER_NAME),
("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
];
if self.request_id.is_some() {

View file

@ -6,9 +6,9 @@ use common::api::open_ai::{
};
use common::configuration::{Overrides, PromptTarget, Tracing};
use common::consts::{
ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME,
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, MESSAGES_KEY, REQUEST_ID_HEADER, SYSTEM_ROLE,
TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
API_REQUEST_TIMEOUT_MS, ARCH_FC_MODEL_NAME, ARCH_INTERNAL_CLUSTER_NAME,
ARCH_UPSTREAM_HOST_HEADER, ASSISTANT_ROLE, DEFAULT_TARGET_REQUEST_TIMEOUT_MS, MESSAGES_KEY,
REQUEST_ID_HEADER, SYSTEM_ROLE, TOOL_ROLE, TRACE_PARENT_HEADER, USER_ROLE,
};
use common::errors::ServerError;
use common::http::{CallArgs, Client};
@ -89,7 +89,7 @@ impl StreamContext {
streaming_response: false,
user_prompt: None,
is_chat_completions_request: false,
overrides: overrides,
overrides,
request_id: None,
traceparent: None,
_tracing: tracing,
@ -160,7 +160,7 @@ impl StreamContext {
callout_context.request_body.messages.clone(),
);
let arch_messages_json = serde_json::to_string(&params).unwrap();
let timeout_str = ARCH_FC_REQUEST_TIMEOUT_MS.to_string();
let timeout_str = DEFAULT_TARGET_REQUEST_TIMEOUT_MS.to_string();
let mut headers = vec![
(":method", "POST"),
@ -302,6 +302,8 @@ impl StreamContext {
}
};
let timeout_str = API_REQUEST_TIMEOUT_MS.to_string();
let http_method_str = http_method.to_string();
let mut headers: HashMap<_, _> = [
(ARCH_UPSTREAM_HOST_HEADER, endpoint_details.name.as_str()),
@ -310,6 +312,7 @@ impl StreamContext {
(":authority", endpoint_details.name.as_str()),
("content-type", "application/json"),
("x-envoy-max-retries", "3"),
("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()),
]
.into_iter()
.collect();

View file

@ -81,10 +81,11 @@ fn normal_flow(module: &mut Tester, filter_context: i32, http_context: i32) {
(":path", "/function_calling"),
("content-type", "application/json"),
(":authority", "model_server"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
None,
None,
None,
Some(5000),
)
.returning(Some(1))
.expect_log(Some(LogLevel::Trace), None)
@ -387,10 +388,11 @@ fn prompt_gateway_request_to_llm_gateway() {
(":authority", "api_server"),
("x-envoy-max-retries", "3"),
(":path", "/weather"),
("x-envoy-upstream-rq-timeout-ms", "30000"),
]),
Some(expected_body),
None,
None,
Some(5000),
)
.returning(Some(2))
.expect_metric_increment("active_http_calls", 1)

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 10000 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 127.0.0.1
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
endpoints:
rag_energy_source_agent:

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o

View file

@ -1,10 +1,11 @@
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
endpoints:
weather_forecast_service:

View file

@ -1,10 +1,11 @@
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o-mini

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:

View file

@ -1,8 +1,10 @@
version: v0.1
listener:
address: 127.0.0.1
port: 8080 #If you configure port 443, you'll need to update the listener with tls_certificates
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
overrides:
optimize_context_window: true

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0 # or 127.0.0.1
port: 10000
# Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:
@ -13,7 +14,6 @@ llm_providers:
access_key: $OPENAI_API_KEY
model: gpt-4o
default: true
stream: true
# default system prompt used by all prompt targets
system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions.
@ -52,11 +52,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
app_server:

View file

@ -42,11 +42,12 @@ Create ``arch_config.yaml`` file with the following content:
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o
@ -144,22 +145,23 @@ Create ``arch_config.yaml`` file with the following content:
version: v0.1
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 30s
llm_providers:
- name: gpt-4o
access_key: $OPENAI_API_KEY
provider: openai
provider_interface: openai
model: gpt-4o
default: true
- name: ministral-3b
access_key: $MISTRAL_API_KEY
provider: mistral
provider_interface: openai
model: ministral-3b-latest
Step 2. Start arch gateway

View file

@ -1,10 +1,11 @@
version: v0.1
listener:
address: 0.0.0.0 # or 127.0.0.1
port: 10000
# Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
message_format: huggingface
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
# Centralized way to manage LLMs, manage keys, retry logic, failover and limits in a central way
llm_providers:
@ -13,7 +14,6 @@ llm_providers:
access_key: $OPENAI_API_KEY
model: gpt-4o
default: true
stream: true
# default system prompt used by all prompt targets
system_prompt: You are a network assistant that just offers facts; not advice on manufacturers or purchasing decisions.
@ -54,11 +54,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
app_server:

View file

@ -1,16 +1,16 @@
version: v0.1
listener:
address: 0.0.0.0 # or 127.0.0.1
port: 10000
# Defines how Arch should parse the content from application/json or text/pain Content-type in the http request
message_format: huggingface
common_tls_context: # If you configure port 443, you'll need to update the listener with your TLS certificates
tls_certificates:
- certificate_chain:
filename: /etc/certs/cert.pem
private_key:
filename: /etc/certs/key.pem
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 5s
egress_traffic:
address: 0.0.0.0
port: 12000
message_format: openai
timeout: 5s
# Arch creates a round-robin load balancing between different endpoints, managed via the cluster subsystem.
endpoints:
@ -35,15 +35,6 @@ llm_providers:
access_key: $OPENAI_API_KEY
model: gpt-4o
default: true
stream: true
rate_limits:
selector: #optional headers, to add rate limiting based on http headers like JWT tokens or API keys
http_header:
name: Authorization
value: "" # Empty value means each separate value has a separate limit
limit:
tokens: 100000 # Tokens per unit
unit: minute
- name: Mistral8x7b
provider_interface: openai
@ -99,11 +90,6 @@ prompt_targets:
default: false
enum: [true, false]
error_target:
endpoint:
name: error_target_1
path: /error
tracing:
# sampling rate. Note by default Arch works on OpenTelemetry compatible tracing.
sampling_rate: 0.1

View file

@ -1,10 +1,11 @@
version: "0.1-beta"
listener:
address: 0.0.0.0
port: 10000
message_format: huggingface
connect_timeout: 0.005s
listeners:
ingress_traffic:
address: 0.0.0.0
port: 10000
message_format: openai
timeout: 30s
endpoints:
weather_forecast_service: