mirror of
https://github.com/katanemo/plano.git
synced 2026-05-15 11:02:39 +02:00
Improve end to end tracing (#628)
* adding canonical tracing support via bright-staff * improved formatting for tools in the traces * removing anthropic from the currency exchange demo * using Envoy to transport traces, not calling OTEL directly * moving otel collector cluster outside tracing if/else * minor fixes to not write to the OTEL collector if tracing is disabled * fixed PR comments and added more trace attributes * more fixes based on PR comments * more clean up based on PR comments --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
This commit is contained in:
parent
8adb9795d8
commit
a79f55f313
34 changed files with 2556 additions and 403 deletions
|
|
@ -51,11 +51,11 @@ static_resources:
|
|||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: archgw(inbound)
|
||||
service_name: plano(inbound)
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
stat_prefix: ingress_traffic
|
||||
stat_prefix: plano(inbound)
|
||||
codec_type: AUTO
|
||||
scheme_header_transformation:
|
||||
scheme_to_overwrite: https
|
||||
|
|
@ -95,21 +95,6 @@ static_resources:
|
|||
- name: envoy.filters.network.http_connection_manager
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
|
||||
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
generate_request_id: true
|
||||
tracing:
|
||||
provider:
|
||||
name: envoy.tracers.opentelemetry
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.config.trace.v3.OpenTelemetryConfig
|
||||
grpc_service:
|
||||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: ingress_traffic
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
stat_prefix: ingress_traffic
|
||||
codec_type: AUTO
|
||||
scheme_header_transformation:
|
||||
|
|
@ -221,7 +206,7 @@ static_resources:
|
|||
- name: outbound_api_traffic
|
||||
address:
|
||||
socket_address:
|
||||
address: 0.0.0.0
|
||||
address: 127.0.0.1
|
||||
port_value: 11000
|
||||
traffic_direction: OUTBOUND
|
||||
filter_chains:
|
||||
|
|
@ -240,7 +225,7 @@ static_resources:
|
|||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: outbound_api_traffic
|
||||
service_name: tools
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
|
|
@ -413,7 +398,7 @@ static_resources:
|
|||
envoy_grpc:
|
||||
cluster_name: opentelemetry_collector
|
||||
timeout: 0.250s
|
||||
service_name: archgw(outbound)
|
||||
service_name: plano(outbound)
|
||||
random_sampling:
|
||||
value: {{ arch_tracing.random_sampling }}
|
||||
{% endif %}
|
||||
|
|
@ -484,6 +469,50 @@ static_resources:
|
|||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
|
||||
|
||||
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
- name: otel_collector_proxy
|
||||
address:
|
||||
socket_address:
|
||||
address: 127.0.0.1
|
||||
port_value: 9903
|
||||
traffic_direction: OUTBOUND
|
||||
filter_chains:
|
||||
- filters:
|
||||
- name: envoy.filters.network.http_connection_manager
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
|
||||
stat_prefix: otel_proxy
|
||||
codec_type: AUTO
|
||||
access_log:
|
||||
- name: envoy.access_loggers.file
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
|
||||
path: "/var/log/access_otel.log"
|
||||
format: |
|
||||
[%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" "%UPSTREAM_CLUSTER%"
|
||||
route_config:
|
||||
name: otel_route
|
||||
virtual_hosts:
|
||||
- name: otel_backend
|
||||
domains: ["*"]
|
||||
routes:
|
||||
- match:
|
||||
prefix: "/v1/traces"
|
||||
route:
|
||||
cluster: opentelemetry_collector_http
|
||||
timeout: 5s
|
||||
retry_policy:
|
||||
retry_on: "5xx,connect-failure,refused-stream,reset"
|
||||
num_retries: 3
|
||||
per_try_timeout: 2s
|
||||
host_selection_retry_max_attempts: 5
|
||||
retriable_status_codes: [500, 502, 503, 504]
|
||||
http_filters:
|
||||
- name: envoy.filters.http.router
|
||||
typed_config:
|
||||
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
|
||||
{% endif %}
|
||||
|
||||
- name: egress_traffic_llm
|
||||
address:
|
||||
socket_address:
|
||||
|
|
@ -1014,7 +1043,6 @@ static_resources:
|
|||
port_value: 12001
|
||||
hostname: arch_listener_llm
|
||||
|
||||
|
||||
{% if "random_sampling" in arch_tracing and arch_tracing["random_sampling"] > 0 %}
|
||||
- name: opentelemetry_collector
|
||||
type: STRICT_DNS
|
||||
|
|
@ -1048,4 +1076,19 @@ static_resources:
|
|||
socket_address:
|
||||
address: host.docker.internal
|
||||
port_value: 4318
|
||||
# Circuit breaker configuration to prevent overwhelming OTEL collector
|
||||
circuit_breakers:
|
||||
thresholds:
|
||||
- priority: DEFAULT
|
||||
max_connections: 100
|
||||
max_pending_requests: 100
|
||||
max_requests: 100
|
||||
max_retries: 3
|
||||
# Health checking and outlier detection
|
||||
outlier_detection:
|
||||
consecutive_5xx: 5
|
||||
interval: 10s
|
||||
base_ejection_time: 30s
|
||||
max_ejection_percent: 50
|
||||
enforcing_consecutive_5xx: 100
|
||||
{% endif %}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue