# Envoy bootstrap configuration, written as a Jinja template.
#
# Template variables referenced below:
#   arch_tracing       - mapping; tracing is enabled when "random_sampling" > 0
#   arch_llm_providers - iterable of providers (name, endpoint, provider_interface)
#   arch_clusters      - mapping of cluster name to endpoint/port/protocol/connect_timeout
#   local_llms         - iterable of local LLM providers (name, endpoint, port)
#   arch_config        - text embedded as the prompt_gateway wasm plugin configuration
#   arch_llm_config    - text embedded as the llm_gateway wasm plugin configuration
#
# NOTE(review): $MODEL_SERVER_PORT is not a Jinja expression; presumably it is
# substituted from the environment after rendering - confirm.
#
# Listener layout:
#   10000 arch_listener_http           -> cluster arch_prompt_gateway_listener (10001)
#   10001 arch_prompt_gateway_listener -> wasm filters, then routed by x-arch-llm-provider
#   11000 arch_internal                -> routed by x-arch-upstream
#   12000 arch_listener_http_llm       -> cluster arch_listener_llm (12001)
#   12001 arch_listener_llm            -> llm_gateway wasm filter, then routed by x-arch-llm-provider
admin:
  address:
    socket_address:
      address: 0.0.0.0
      port_value: 9901

stats_config:
  histogram_bucket_settings:
    # Custom bucket boundaries for the time-to-first-token histogram.
    match:
      exact: "wasmcustom.time_to_first_token"
    buckets:
      - 100
      - 500
      - 800
      - 1000
      - 1200
      - 1400
      - 1600
      - 1800
      - 2000
      - 2200
      - 2400
      - 3000
      - 3500
      - 4000
      - 4500
      - 5000
      - 6000
      - 10000
      - 60000
      - 180000

static_resources:
  listeners:
    # Ingress entry point: forwards every request to the prompt gateway listener.
    - name: arch_listener_http
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 10000
      traffic_direction: INBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
                generate_request_id: true
                tracing:
                  provider:
                    name: envoy.tracers.opentelemetry
                    typed_config:
                      "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
                      grpc_service:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
                      service_name: arch_gateway
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
                stat_prefix: arch_listener_http
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: "/var/log/access_ingress.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
                    - name: local_service
                      domains:
                        - "*"
                      routes:
                        - match:
                            prefix: "/"
                          route:
                            auto_host_rewrite: true
                            cluster: arch_prompt_gateway_listener
                            timeout: 60s
                http_filters:
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

    # Prompt gateway: runs the prompt_gateway and llm_gateway wasm plugins, then
    # routes to the cluster named by the x-arch-llm-provider request header.
    - name: arch_prompt_gateway_listener
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 10001
      traffic_direction: INBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
                generate_request_id: true
                tracing:
                  provider:
                    name: envoy.tracers.opentelemetry
                    typed_config:
                      "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
                      grpc_service:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
                      service_name: prompt_processor
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
                stat_prefix: arch_prompt_gateway_listener
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: "/var/log/access_ingress_prompt.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
                    - name: local_service
                      domains:
                        - "*"
                      routes:
                      {% for provider in arch_llm_providers %}
                        # if endpoint is set then use custom cluster for upstream llm
                        {% set llm_cluster_name = provider.name if provider.endpoint else provider.provider_interface %}
                        - match:
                            prefix: "/"
                            headers:
                              - name: "x-arch-llm-provider"
                                string_match:
                                  exact: "{{ llm_cluster_name }}"
                          route:
                            auto_host_rewrite: true
                            cluster: "{{ llm_cluster_name }}"
                            timeout: 60s
                      {% endfor %}
                http_filters:
                  - name: envoy.filters.http.compressor
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor
                      compressor_library:
                        name: compress
                        typed_config:
                          "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip
                          memory_level: 3
                          window_bits: 10
                  - name: envoy.filters.http.wasm_prompt
                    typed_config:
                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
                      type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
                      value:
                        config:
                          name: "http_config"
                          root_id: prompt_gateway
                          # The block scalar body must start at column 32 so that it lines up
                          # with the continuation lines produced by the indent(32) filter.
                          configuration:
                            "@type": "type.googleapis.com/google.protobuf.StringValue"
                            value: |
                                {{ arch_config | indent(32) }}
                          vm_config:
                            runtime: "envoy.wasm.runtime.v8"
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm"
                  - name: envoy.filters.http.wasm_llm
                    typed_config:
                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
                      type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
                      value:
                        config:
                          name: "http_config"
                          root_id: llm_gateway
                          # The block scalar body must start at column 32 so that it lines up
                          # with the continuation lines produced by the indent(32) filter.
                          configuration:
                            "@type": "type.googleapis.com/google.protobuf.StringValue"
                            value: |
                                {{ arch_llm_config | indent(32) }}
                          vm_config:
                            runtime: "envoy.wasm.runtime.v8"
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
                  - name: envoy.filters.http.decompressor
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
                      decompressor_library:
                        name: decompress
                        typed_config:
                          "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip"
                          window_bits: 9
                          chunk_size: 8192
                          # If this ratio is set too low, then body data will not be decompressed completely.
                          max_inflate_ratio: 1000
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

    # Internal listener: routes to the cluster named by the x-arch-upstream header.
    - name: arch_internal
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 11000
      traffic_direction: OUTBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
                generate_request_id: true
                tracing:
                  provider:
                    name: envoy.tracers.opentelemetry
                    typed_config:
                      "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
                      grpc_service:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
                      service_name: prompt_processor
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
                stat_prefix: arch_internal
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: "/var/log/access_internal.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
                    - name: local_service
                      domains:
                        - "*"
                      routes:
                      {% for internal_cluster in ["arch_fc", "model_server"] %}
                        - match:
                            prefix: "/"
                            headers:
                              - name: "x-arch-upstream"
                                string_match:
                                  exact: "{{ internal_cluster }}"
                          route:
                            auto_host_rewrite: true
                            cluster: "{{ internal_cluster }}"
                            timeout: 60s
                      {% endfor %}
                      {% for cluster_name, cluster in arch_clusters.items() %}
                        - match:
                            prefix: "/"
                            headers:
                              - name: "x-arch-upstream"
                                string_match:
                                  exact: "{{ cluster_name }}"
                          route:
                            auto_host_rewrite: true
                            cluster: "{{ cluster_name }}"
                            timeout: 60s
                      {% endfor %}
                http_filters:
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

    # LLM entry point: forwards every request to the LLM gateway listener.
    - name: arch_listener_http_llm
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 12000
      traffic_direction: INBOUND
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
                generate_request_id: true
                tracing:
                  provider:
                    name: envoy.tracers.opentelemetry
                    typed_config:
                      "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
                      grpc_service:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
                      service_name: arch_gateway
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
                # Own prefix per listener so its stats are not merged with those of
                # arch_listener_http (previously all three shared "arch_listener_http").
                stat_prefix: arch_listener_http_llm
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: "/var/log/access_llm.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
                    - name: local_service
                      domains:
                        - "*"
                      routes:
                        - match:
                            prefix: "/"
                          route:
                            auto_host_rewrite: true
                            cluster: arch_listener_llm
                            timeout: 60s
                http_filters:
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

    # LLM gateway: runs the llm_gateway wasm plugin, then routes to the cluster
    # named by the x-arch-llm-provider header; requests without a matching
    # header receive a 400 direct response.
    - name: arch_listener_llm
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 12001
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
                generate_request_id: true
                tracing:
                  provider:
                    name: envoy.tracers.opentelemetry
                    typed_config:
                      "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
                      grpc_service:
                        envoy_grpc:
                          cluster_name: opentelemetry_collector
                        timeout: 0.250s
                      service_name: llm_gateway
                  random_sampling:
                    value: {{ arch_tracing.random_sampling }}
                {% endif %}
                # Own prefix per listener so its stats are not merged with those of
                # arch_listener_http (previously all three shared "arch_listener_http").
                stat_prefix: arch_listener_llm
                codec_type: AUTO
                scheme_header_transformation:
                  scheme_to_overwrite: https
                access_log:
                  - name: envoy.access_loggers.file
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
                      path: "/var/log/access_llm.log"
                route_config:
                  name: local_routes
                  virtual_hosts:
                    - name: local_service
                      domains:
                        - "*"
                      routes:
                        - match:
                            prefix: "/healthz"
                          route:
                            auto_host_rewrite: true
                            cluster: openai
                            timeout: 60s
                      {% for provider in arch_llm_providers %}
                        # if endpoint is set then use custom cluster for upstream llm
                        {% set llm_cluster_name = provider.name if provider.endpoint else provider.provider_interface %}
                        - match:
                            prefix: "/"
                            headers:
                              - name: "x-arch-llm-provider"
                                string_match:
                                  exact: "{{ llm_cluster_name }}"
                          route:
                            auto_host_rewrite: true
                            cluster: "{{ llm_cluster_name }}"
                            timeout: 60s
                      {% endfor %}
                        # Catch-all: no provider route matched the request.
                        - match:
                            prefix: "/"
                          direct_response:
                            status: 400
                            body:
                              inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n"
                http_filters:
                  - name: envoy.filters.http.compressor
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.compressor.v3.Compressor
                      compressor_library:
                        name: compress
                        typed_config:
                          "@type": type.googleapis.com/envoy.extensions.compression.gzip.compressor.v3.Gzip
                          memory_level: 3
                          window_bits: 10
                  - name: envoy.filters.http.wasm
                    typed_config:
                      "@type": type.googleapis.com/udpa.type.v1.TypedStruct
                      type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm
                      value:
                        config:
                          name: "http_config"
                          root_id: llm_gateway
                          # The block scalar body must start at column 32 so that it lines up
                          # with the continuation lines produced by the indent(32) filter.
                          configuration:
                            "@type": "type.googleapis.com/google.protobuf.StringValue"
                            value: |
                                {{ arch_llm_config | indent(32) }}
                          vm_config:
                            runtime: "envoy.wasm.runtime.v8"
                            code:
                              local:
                                filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm"
                  - name: envoy.filters.http.decompressor
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.decompressor.v3.Decompressor
                      decompressor_library:
                        name: decompress
                        typed_config:
                          "@type": "type.googleapis.com/envoy.extensions.compression.gzip.decompressor.v3.Gzip"
                          window_bits: 9
                          chunk_size: 8192
                          # If this ratio is set too low, then body data will not be decompressed completely.
                          max_inflate_ratio: 1000
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router

  clusters:
    # Hosted LLM providers, reached over TLS.
    - name: openai
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: openai
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: api.openai.com
                      port_value: 443
                  hostname: "api.openai.com"
      transport_socket:
        name: envoy.transport_sockets.tls
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
          sni: api.openai.com
          common_tls_context:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
    - name: mistral
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: mistral
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: api.mistral.ai
                      port_value: 443
                  hostname: "api.mistral.ai"
      transport_socket:
        name: envoy.transport_sockets.tls
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
          sni: api.mistral.ai

  # Internal services; both names resolve to the same host and port.
  {% for internal_cluster in ["arch_fc", "model_server"] %}
    - name: "{{ internal_cluster }}"
      connect_timeout: 5s
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: "{{ internal_cluster }}"
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: host.docker.internal
                      port_value: $MODEL_SERVER_PORT
                  hostname: "{{ internal_cluster }}"
  {% endfor %}
    - name: mistral_7b_instruct
      connect_timeout: 5s
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: mistral_7b_instruct
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: mistral_7b_instruct
                      port_value: 10001
                  hostname: "mistral_7b_instruct"

  # One cluster per entry in arch_clusters. The whitespace-control tags around
  # connect_timeout are load-bearing: they keep the rendered key aligned.
  {% for cluster_name, cluster in arch_clusters.items() %}
    - name: "{{ cluster_name }}"
      {% if cluster.connect_timeout -%}
      connect_timeout: {{ cluster.connect_timeout }}
      {% else -%}
      connect_timeout: 5s
      {% endif -%}
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: "{{ cluster_name }}"
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: "{{ cluster.endpoint }}"
                      port_value: {{ cluster.port }}
                  hostname: "{{ cluster.endpoint }}"
      {% if cluster.protocol == "https" %}
      transport_socket:
        name: envoy.transport_sockets.tls
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext
          sni: "{{ cluster.endpoint }}"
          common_tls_context:
            tls_params:
              tls_minimum_protocol_version: TLSv1_2
              tls_maximum_protocol_version: TLSv1_3
      {% endif %}
  {% endfor %}

  # One cluster per locally hosted LLM provider.
  {% for local_llm_provider in local_llms %}
    - name: "{{ local_llm_provider.name }}"
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: "{{ local_llm_provider.name }}"
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: "{{ local_llm_provider.endpoint }}"
                      port_value: {{ local_llm_provider.port }}
                  hostname: "{{ local_llm_provider.endpoint }}"
  {% endfor %}

    # Loopback clusters pointing at this proxy's own listeners.
    - name: arch_internal
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: arch_internal
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: 0.0.0.0
                      port_value: 11000
                  hostname: arch_internal
    - name: arch_prompt_gateway_listener
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: arch_prompt_gateway_listener
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: 0.0.0.0
                      port_value: 10001
                  hostname: arch_prompt_gateway_listener
    - name: arch_listener_llm
      connect_timeout: 5s
      type: LOGICAL_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: arch_listener_llm
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: 0.0.0.0
                      port_value: 12001
                  hostname: arch_listener_llm

  # Trace collector clusters, rendered only when tracing is enabled.
  {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
    # Referenced by the tracing grpc_service of every listener; configured for HTTP/2.
    - name: opentelemetry_collector
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options: {}
      load_assignment:
        cluster_name: opentelemetry_collector
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: host.docker.internal
                      port_value: 4317
    # No protocol options here: the previous bare, valueless
    # typed_extension_protocol_options key was an implicit null and was dropped.
    - name: opentelemetry_collector_http
      type: STRICT_DNS
      dns_lookup_family: V4_ONLY
      lb_policy: ROUND_ROBIN
      load_assignment:
        cluster_name: opentelemetry_collector_http
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: host.docker.internal
                      port_value: 4318
  {% endif %}