diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml index 4c716e3a..235807d5 100644 --- a/arch/envoy.template.yaml +++ b/arch/envoy.template.yaml @@ -1,398 +1,361 @@ -admin: - address: - socket_address: { address: 0.0.0.0, port_value: 9901 } -static_resources: - listeners: - - name: arch_listener_http - address: - socket_address: - address: 0.0.0.0 - port_value: 10000 - traffic_direction: INBOUND - filter_chains: - - filters: - - name: envoy.filters.network.http_connection_manager - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager - {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - generate_request_id: true - tracing: - provider: - name: envoy.tracers.opentelemetry - typed_config: - "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig - grpc_service: - envoy_grpc: - cluster_name: opentelemetry_collector - timeout: 0.250s - service_name: arch - random_sampling: - value: {{ arch_tracing.random_sampling }} - {% endif %} - stat_prefix: arch_listener_http - codec_type: AUTO - scheme_header_transformation: - scheme_to_overwrite: https - access_log: - - name: envoy.access_loggers.file - typed_config: - "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog - path: "/var/log/access_ingress.log" - route_config: - name: local_routes - virtual_hosts: - - name: local_service - domains: - - "*" - routes: - - match: - prefix: "/" - route: - auto_host_rewrite: true - cluster: arch_llm_listener - timeout: 60s - http_filters: - - name: envoy.filters.http.wasm - typed_config: - "@type": type.googleapis.com/udpa.type.v1.TypedStruct - type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm - value: - config: - name: "http_config" - root_id: prompt_gateway - configuration: - "@type": "type.googleapis.com/google.protobuf.StringValue" - value: | - {{ arch_config | indent(32) }} - vm_config: - runtime: "envoy.wasm.runtime.v8" - code: - local: - filename: "/etc/envoy/proxy-wasm-plugins/prompt_gateway.wasm" - - name: envoy.filters.http.router - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - - name: arch_internal - address: - socket_address: - address: 0.0.0.0 - port_value: 11000 - traffic_direction: OUTBOUND - filter_chains: - - filters: - - name: envoy.filters.network.http_connection_manager - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager - {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - generate_request_id: true - tracing: - provider: - name: envoy.tracers.opentelemetry - typed_config: - "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig - grpc_service: - envoy_grpc: - cluster_name: opentelemetry_collector - timeout: 0.250s - service_name: arch - random_sampling: - value: {{ arch_tracing.random_sampling }} - {% endif %} - stat_prefix: arch_internal - codec_type: AUTO - scheme_header_transformation: - scheme_to_overwrite: https - access_log: - - name: envoy.access_loggers.file - typed_config: - "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog - path: "/var/log/access_internal.log" - route_config: - name: local_routes - virtual_hosts: - - name: local_service - domains: - - "*" - routes: - - match: - prefix: "/" - headers: - - name: "x-arch-upstream" - string_match: - exact: model_server - route: - auto_host_rewrite: true - cluster: model_server - timeout: 60s - - match: - prefix: "/" - headers: - - name: "x-arch-upstream" - string_match: - exact: arch_fc - route: - auto_host_rewrite: true - cluster: model_server - timeout: 60s - {% for _, cluster in arch_clusters.items() %} - - match: - prefix: "/" - headers: - - name: "x-arch-upstream" - string_match: - exact: {{ cluster.name }} - route: - auto_host_rewrite: true - cluster: {{ cluster.name }} - timeout: 60s - {% endfor %} - http_filters: - - name: envoy.filters.http.router - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - - name: arch_listener_llm - address: - socket_address: - address: 0.0.0.0 - port_value: 12000 - filter_chains: - - filters: - - name: envoy.filters.network.http_connection_manager - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager - {% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - generate_request_id: true - tracing: - provider: - name: envoy.tracers.opentelemetry - typed_config: - "@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig - grpc_service: - envoy_grpc: - cluster_name: opentelemetry_collector - timeout: 0.250s - service_name: arch - random_sampling: - value: {{ arch_tracing.random_sampling }} - {% endif %} - stat_prefix: arch_listener_http - codec_type: AUTO - scheme_header_transformation: - scheme_to_overwrite: https - access_log: - - name: envoy.access_loggers.file - typed_config: - "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog - path: "/var/log/access_llm.log" - route_config: - name: local_routes - virtual_hosts: - - name: local_service - domains: - - "*" - routes: - {% for provider in arch_llm_providers %} - - match: - prefix: "/" - headers: - - name: "x-arch-llm-provider" - string_match: - exact: {{ provider.name }} - route: - auto_host_rewrite: true - cluster: {{ provider.provider }} - timeout: 60s - {% endfor %} - - match: - prefix: "/" - direct_response: - status: 400 - body: - inline_string: "x-arch-llm-provider header not set, llm gateway cannot perform routing\n" - http_filters: - - name: envoy.filters.http.wasm - typed_config: - "@type": type.googleapis.com/udpa.type.v1.TypedStruct - type_url: type.googleapis.com/envoy.extensions.filters.http.wasm.v3.Wasm - value: - config: - name: "http_config" - root_id: llm_gateway - configuration: - "@type": "type.googleapis.com/google.protobuf.StringValue" - value: | - {{ arch_llm_config | indent(32) }} - vm_config: - runtime: "envoy.wasm.runtime.v8" - code: - local: - filename: "/etc/envoy/proxy-wasm-plugins/llm_gateway.wasm" - - name: envoy.filters.http.router - typed_config: - "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router - - clusters: - - name: openai - connect_timeout: 5s - type: LOGICAL_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: openai - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: api.openai.com - port_value: 443 - hostname: "api.openai.com" - transport_socket: - name: envoy.transport_sockets.tls - typed_config: - "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext - sni: api.openai.com - common_tls_context: - tls_params: - tls_minimum_protocol_version: TLSv1_2 - tls_maximum_protocol_version: TLSv1_3 - - name: mistral - connect_timeout: 5s - type: LOGICAL_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: mistral - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: api.mistral.ai - port_value: 443 - hostname: "api.mistral.ai" - transport_socket: - name: envoy.transport_sockets.tls - typed_config: - "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext - sni: api.mistral.ai - - name: model_server - connect_timeout: 5s - type: STRICT_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: model_server - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: host.docker.internal - port_value: 51000 - hostname: "model_server" - - name: mistral_7b_instruct - connect_timeout: 5s - type: STRICT_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: mistral_7b_instruct - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: mistral_7b_instruct - port_value: 10001 - hostname: "mistral_7b_instruct" - - name: arch_fc - connect_timeout: 5s - type: STRICT_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: arch_fc - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: host.docker.internal - port_value: 51000 - hostname: "arch_fc" -{% for _, cluster in arch_clusters.items() %} - - name: {{ cluster.name }} - {% if cluster.connect_timeout -%} - connect_timeout: {{ cluster.connect_timeout }} - {% else -%} - connect_timeout: 5s - {% endif -%} - type: LOGICAL_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: {{ cluster.name }} - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: {{ cluster.endpoint }} - port_value: {{ cluster.port }} - hostname: {{ cluster.name }} -{% endfor %} - - name: arch_internal - connect_timeout: 5s - type: LOGICAL_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: arch_internal - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: 0.0.0.0 - port_value: 11000 - hostname: arch_internal - - - name: arch_llm_listener - connect_timeout: 5s - type: LOGICAL_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - load_assignment: - cluster_name: arch_llm_listener - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: 0.0.0.0 - port_value: 12000 - hostname: arch_llm_listener - -{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %} - - name: opentelemetry_collector - type: STRICT_DNS - dns_lookup_family: V4_ONLY - lb_policy: ROUND_ROBIN - typed_extension_protocol_options: - envoy.extensions.upstreams.http.v3.HttpProtocolOptions: - "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions - explicit_http_config: - http2_protocol_options: {} - load_assignment: - cluster_name: opentelemetry_collector - endpoints: - - lb_endpoints: - - endpoint: - address: - socket_address: - address: host.docker.internal - port_value: 4317 -{% endif %} +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 54, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum{envoy_cluster_name!=\"openai\",envoy_cluster_name!=\"arch_llm_listener\"}[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "request latency - internal (ms)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum{envoy_cluster_name!=\"hallucination\", envoy_cluster_name!=\"zeroshot\", envoy_cluster_name!=\"embeddings\", envoy_cluster_name!=\"arch_fc\", envoy_cluster_name!=\"api_server\"}[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "request latency - external (ms)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed{envoy_cluster_name !=\"opentelemetry_collector\"}[1m])) by (envoy_cluster_name)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_completed{envoy_cluster_name !=\"opentelemetry_collector\"}[1m])) by (envoy_cluster_name)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Upstream request count", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Intelligent Gateway Overview", + "uid": "adt6uhx5lk8aob", + "version": 2, + "weekStart": "" +} diff --git a/crates/common/src/consts.rs b/crates/common/src/consts.rs index 76244f6b..badba77d 100644 --- a/crates/common/src/consts.rs +++ b/crates/common/src/consts.rs @@ -6,9 +6,13 @@ pub const RATELIMIT_SELECTOR_HEADER_KEY: &str = "x-arch-ratelimit-selector"; pub const SYSTEM_ROLE: &str = "system"; pub const USER_ROLE: &str = "user"; pub const GPT_35_TURBO: &str = "gpt-3.5-turbo"; -pub const ARC_FC_CLUSTER: &str = "arch_fc"; pub const ARCH_FC_REQUEST_TIMEOUT_MS: u64 = 120000; // 2 minutes pub const MODEL_SERVER_NAME: &str = "model_server"; +pub const ZEROSHOT_INTERNAL_HOST: &str = "zeroshot"; +pub const ARCH_FC_INTERNAL_HOST: &str = "arch_fc"; +pub const HALLUCINATION_INTERNAL_HOST: &str = "hallucination"; +pub const EMBEDDINGS_INTERNAL_HOST: &str = "embeddings"; +pub const GUARD_INTERNAL_HOST: &str = "guard"; pub const ARCH_ROUTING_HEADER: &str = "x-arch-llm-provider"; pub const ARCH_MESSAGES_KEY: &str = "arch_messages"; pub const ARCH_PROVIDER_HINT_HEADER: &str = "x-arch-llm-provider-hint"; diff --git a/crates/prompt_gateway/src/prompt_filter_context.rs b/crates/prompt_gateway/src/prompt_filter_context.rs index 0c25ee5c..5d7c1e71 100644 --- a/crates/prompt_gateway/src/prompt_filter_context.rs +++ b/crates/prompt_gateway/src/prompt_filter_context.rs @@ -1,10 +1,9 @@ use crate::prompt_stream_context::PromptStreamContext; use common::common_types::EmbeddingType; use common::configuration::{Configuration, GatewayMode, Overrides, PromptGuards, PromptTarget}; -use common::consts::ARCH_INTERNAL_CLUSTER_NAME; +use common::consts::{ARCH_INTERNAL_CLUSTER_NAME, EMBEDDINGS_INTERNAL_HOST}; use common::consts::ARCH_UPSTREAM_HOST_HEADER; use common::consts::DEFAULT_EMBEDDING_MODEL; -use common::consts::MODEL_SERVER_NAME; use common::embeddings::{ CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse, }; @@ -105,10 +104,10 @@ impl PromptGatewayFilterContext { ARCH_INTERNAL_CLUSTER_NAME, "/embeddings", vec![ - (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), + (ARCH_UPSTREAM_HOST_HEADER, EMBEDDINGS_INTERNAL_HOST), (":method", "POST"), (":path", "/embeddings"), - (":authority", MODEL_SERVER_NAME), + (":authority", EMBEDDINGS_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-upstream-rq-timeout-ms", "60000"), ], diff --git a/crates/prompt_gateway/src/prompt_stream_context.rs b/crates/prompt_gateway/src/prompt_stream_context.rs index d208f5e8..503ea8b5 100644 --- a/crates/prompt_gateway/src/prompt_stream_context.rs +++ b/crates/prompt_gateway/src/prompt_stream_context.rs @@ -12,11 +12,7 @@ use common::common_types::{ }; use common::configuration::{Overrides, PromptGuards, PromptTarget}; use common::consts::{ - ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME, ARCH_MESSAGES_KEY, - ARCH_MODEL_PREFIX, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, ARC_FC_CLUSTER, - CHAT_COMPLETIONS_PATH, DEFAULT_EMBEDDING_MODEL, DEFAULT_HALLUCINATED_THRESHOLD, - DEFAULT_INTENT_MODEL, DEFAULT_PROMPT_TARGET_THRESHOLD, GPT_35_TURBO, MODEL_SERVER_NAME, - REQUEST_ID_HEADER, SYSTEM_ROLE, USER_ROLE, + ARCH_FC_MODEL_NAME, ARCH_FC_REQUEST_TIMEOUT_MS, ARCH_INTERNAL_CLUSTER_NAME, ARCH_MESSAGES_KEY, ARCH_MODEL_PREFIX, ARCH_STATE_HEADER, ARCH_UPSTREAM_HOST_HEADER, ARCH_FC_INTERNAL_HOST, CHAT_COMPLETIONS_PATH, DEFAULT_EMBEDDING_MODEL, DEFAULT_HALLUCINATED_THRESHOLD, DEFAULT_INTENT_MODEL, DEFAULT_PROMPT_TARGET_THRESHOLD, EMBEDDINGS_INTERNAL_HOST, GPT_35_TURBO, GUARD_INTERNAL_HOST, HALLUCINATION_INTERNAL_HOST, REQUEST_ID_HEADER, SYSTEM_ROLE, USER_ROLE, ZEROSHOT_INTERNAL_HOST }; use common::embeddings::{ CreateEmbeddingRequest, CreateEmbeddingRequestInput, CreateEmbeddingResponse, @@ -239,10 +235,10 @@ impl PromptStreamContext { }; let mut headers = vec![ - (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), + (ARCH_UPSTREAM_HOST_HEADER, ZEROSHOT_INTERNAL_HOST), (":method", "POST"), (":path", "/zeroshot"), - (":authority", MODEL_SERVER_NAME), + (":authority", ZEROSHOT_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), ("x-envoy-upstream-rq-timeout-ms", "60000"), @@ -545,9 +541,9 @@ impl PromptStreamContext { let mut headers = vec![ (":method", "POST"), - (ARCH_UPSTREAM_HOST_HEADER, ARC_FC_CLUSTER), + (ARCH_UPSTREAM_HOST_HEADER, ARCH_FC_INTERNAL_HOST), (":path", "/v1/chat/completions"), - (":authority", ARC_FC_CLUSTER), + (":authority", ARCH_FC_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), ("x-envoy-upstream-rq-timeout-ms", timeout_str.as_str()), @@ -698,10 +694,10 @@ impl PromptStreamContext { }; let mut headers = vec![ - (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), + (ARCH_UPSTREAM_HOST_HEADER, HALLUCINATION_INTERNAL_HOST), (":method", "POST"), (":path", "/hallucination"), - (":authority", MODEL_SERVER_NAME), + (":authority", HALLUCINATION_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), ("x-envoy-upstream-rq-timeout-ms", "60000"), @@ -921,10 +917,10 @@ impl PromptStreamContext { }; let mut headers = vec![ - (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), + (ARCH_UPSTREAM_HOST_HEADER, EMBEDDINGS_INTERNAL_HOST), (":method", "POST"), (":path", "/embeddings"), - (":authority", MODEL_SERVER_NAME), + (":authority", EMBEDDINGS_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), ("x-envoy-upstream-rq-timeout-ms", "60000"), @@ -1177,10 +1173,10 @@ impl HttpContext for PromptStreamContext { }; let mut headers = vec![ - (ARCH_UPSTREAM_HOST_HEADER, MODEL_SERVER_NAME), + (ARCH_UPSTREAM_HOST_HEADER, GUARD_INTERNAL_HOST), (":method", "POST"), (":path", "/guard"), - (":authority", MODEL_SERVER_NAME), + (":authority", GUARD_INTERNAL_HOST), ("content-type", "application/json"), ("x-envoy-max-retries", "3"), ("x-envoy-upstream-rq-timeout-ms", "60000"), diff --git a/demos/function_calling/grafana/dashboards/envoy_overview.json b/demos/function_calling/grafana/dashboards/envoy_overview.json index eca35419..235807d5 100644 --- a/demos/function_calling/grafana/dashboards/envoy_overview.json +++ b/demos/function_calling/grafana/dashboards/envoy_overview.json @@ -39,8 +39,8 @@ "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, + "drawStyle": "bars", + "fillOpacity": 54, "gradientMode": "none", "hideFrom": { "legend": false, @@ -58,7 +58,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -79,32 +79,7 @@ ] } }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "api_server" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -112,7 +87,7 @@ "x": 0, "y": 0 }, - "id": 3, + "id": 1, "options": { "legend": { "calcs": [], @@ -133,34 +108,18 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed{envoy_cluster_name=~\"api_server|openai\"}[1m])) by (envoy_cluster_name)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "avg(rate(envoy_cluster_external_upstream_rq_completed{envoy_cluster_name=~\"api_server|openai\"}[1m])) by (envoy_cluster_name)", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum{envoy_cluster_name!=\"openai\",envoy_cluster_name!=\"arch_llm_listener\"}[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, "instant": false, "legendFormat": "__auto", "range": true, - "refId": "B", + "refId": "A", "useBackend": false } ], - "title": "Upstream request rate", + "title": "request latency - internal (ms)", "type": "timeseries" }, { @@ -230,7 +189,7 @@ "x": 12, "y": 0 }, - "id": 1, + "id": 4, "options": { "legend": { "calcs": [], @@ -251,7 +210,7 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "avg(rate (envoy_cluster_external_upstream_rq_time_sum{envoy_cluster_name=~\"api_server|openai\"}[1m])/ rate(envoy_cluster_external_upstream_rq_time_count{envoy_cluster_name=~\"api_server|openai\"}[1m])) by (envoy_cluster_name)", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_time_sum{envoy_cluster_name!=\"hallucination\", envoy_cluster_name!=\"zeroshot\", envoy_cluster_name!=\"embeddings\", envoy_cluster_name!=\"arch_fc\", envoy_cluster_name!=\"api_server\"}[1m]) / rate(envoy_cluster_external_upstream_rq_time_count[1m])) by (envoy_cluster_name)", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -274,7 +233,40 @@ "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { @@ -299,25 +291,19 @@ "x": 0, "y": 8 }, - "id": 4, + "id": 3, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -325,98 +311,38 @@ "uid": "PBFA97CFB590B2093" }, "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "envoy_cluster_upstream_rq_completed{envoy_cluster_name=~\"openai|api_server\"}", + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_internal_upstream_rq_completed{envoy_cluster_name !=\"opentelemetry_collector\"}[1m])) by (envoy_cluster_name)", "fullMetaSearch": false, "includeNullMetadata": true, - "instant": true, - "legendFormat": "{{envoy_cluster_name}}", - "range": false, + "instant": false, + "legendFormat": "__auto", + "range": true, "refId": "A", "useBackend": false - } - ], - "title": "# of Completd Requests", - "type": "stat" - }, - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "envoy_cluster_upstream_rq_cancelled{envoy_cluster_name=~\"api_server|openai\"} + envoy_cluster_upstream_rq_pending_failure_eject{envoy_cluster_name=~\"api_server|openai\"} + envoy_cluster_upstream_rq_pending_overflow{envoy_cluster_name=~\"api_server|openai\"}", + "editorMode": "code", + "expr": "avg(rate(envoy_cluster_external_upstream_rq_completed{envoy_cluster_name !=\"opentelemetry_collector\"}[1m])) by (envoy_cluster_name)", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": true, - "instant": true, - "legendFormat": "{{envoy_cluster_name}}", - "range": false, - "refId": "A", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", "useBackend": false } ], - "title": "# of Failed or Cancelled Requests", - "type": "stat" + "title": "Upstream request count", + "type": "timeseries" } ], - "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -428,8 +354,8 @@ }, "timepicker": {}, "timezone": "browser", - "title": "Arch Gateway Dashboard", + "title": "Intelligent Gateway Overview", "uid": "adt6uhx5lk8aob", - "version": 1, + "version": 2, "weekStart": "" }