Improve end to end tracing (#628)

* adding canonical tracing support via bright-staff

* improved formatting for tools in the traces

* removing anthropic from the currency exchange demo

* using Envoy to transport traces, not calling OTEL directly

* moving otel collector cluster outside tracing if/else

* minor fixes to not write to the OTEL collector if tracing is disabled

* fixed PR comments and added more trace attributes

* more fixes based on PR comments

* more clean up based on PR comments

---------

Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
This commit is contained in:
Salman Paracha 2025-12-11 15:21:57 -08:00 committed by GitHub
parent 8adb9795d8
commit a79f55f313
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 2556 additions and 403 deletions

View file

@ -2,26 +2,18 @@ use crate::metrics::Metrics;
use crate::stream_context::StreamContext;
use common::configuration::Configuration;
use common::configuration::Overrides;
use common::consts::OTEL_COLLECTOR_HTTP;
use common::consts::OTEL_POST_PATH;
use common::http::CallArgs;
use common::http::Client;
use common::llm_providers::LlmProviders;
use common::ratelimit;
use common::stats::Gauge;
use common::tracing::TraceData;
use log::trace;
use log::warn;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::cell::RefCell;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::rc::Rc;
use std::time::Duration;
use std::sync::{Arc, Mutex};
/// Per-callout marker stored in `FilterContext::callouts`, keyed by the
/// token id returned from `http_call`, so `on_http_call_response` can match
/// a collector response back to its originating request. Currently carries
/// no payload — presumably reserved for future per-request state; confirm.
#[derive(Debug)]
pub struct CallContext {}
@ -31,7 +23,6 @@ pub struct FilterContext {
// callouts stores token_id to request mapping that we use during #on_http_call_response to match the response to the request.
callouts: RefCell<HashMap<u32, CallContext>>,
llm_providers: Option<Rc<LlmProviders>>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
}
@ -41,7 +32,6 @@ impl FilterContext {
callouts: RefCell::new(HashMap::new()),
metrics: Rc::new(Metrics::new()),
llm_providers: None,
traces_queue: Arc::new(Mutex::new(VecDeque::new())),
overrides: Rc::new(None),
}
}
@ -95,7 +85,6 @@ impl RootContext for FilterContext {
.as_ref()
.expect("LLM Providers must exist when Streams are being created"),
),
Arc::clone(&self.traces_queue),
Rc::clone(&self.overrides),
)))
}
@ -108,34 +97,6 @@ impl RootContext for FilterContext {
self.set_tick_period(Duration::from_secs(1));
true
}
/// Periodic tick handler: drains the shared trace queue and forwards each
/// queued trace to the OTEL collector as a JSON POST.
///
/// Uses `try_lock` so the tick never blocks when a stream context holds the
/// queue lock — undelivered traces simply wait for the next tick. Failures
/// (serialization or callout scheduling) are logged and skipped so one bad
/// trace cannot wedge the queue or panic the filter's event loop.
fn on_tick(&mut self) {
    let _ = self.traces_queue.try_lock().map(|mut traces_queue| {
        while let Some(trace) = traces_queue.pop_front() {
            // Never panic inside the tick loop: on a serialization error,
            // log and drop this trace instead of `unwrap()`.
            let trace_str = match serde_json::to_string(&trace) {
                Ok(json) => json,
                Err(err) => {
                    warn!("failed to serialize trace, dropping it: {}", err);
                    continue;
                }
            };
            trace!("trace details: {}", trace_str);
            let call_args = CallArgs::new(
                OTEL_COLLECTOR_HTTP,
                OTEL_POST_PATH,
                vec![
                    (":method", http::Method::POST.as_str()),
                    (":path", OTEL_POST_PATH),
                    (":authority", OTEL_COLLECTOR_HTTP),
                    ("content-type", "application/json"),
                ],
                Some(trace_str.as_bytes()),
                vec![],
                Duration::from_secs(60),
            );
            // Scheduling failure is non-fatal: log it and keep draining the
            // rest of the queue.
            if let Err(error) = self.http_call(call_args, CallContext {}) {
                warn!(
                    "failed to schedule http call to otel-collector: {:?}",
                    error
                );
            }
        }
    });
}
}
impl Context for FilterContext {

View file

@ -4,10 +4,8 @@ use log::{debug, info, warn};
use proxy_wasm::hostcalls::get_current_time;
use proxy_wasm::traits::*;
use proxy_wasm::types::*;
use std::collections::VecDeque;
use std::num::NonZero;
use std::rc::Rc;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use crate::metrics::Metrics;
@ -20,7 +18,6 @@ use common::errors::ServerError;
use common::llm_providers::LlmProviders;
use common::ratelimit::Header;
use common::stats::{IncrementingMetric, RecordingMetric};
use common::tracing::{Event, Span, TraceData, Traceparent};
use common::{ratelimit, routing, tokenizer};
use hermesllm::apis::streaming_shapes::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
use hermesllm::apis::streaming_shapes::sse::{
@ -51,7 +48,6 @@ pub struct StreamContext {
ttft_time: Option<u128>,
traceparent: Option<String>,
request_body_sent_time: Option<u128>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
user_message: Option<String>,
upstream_status_code: Option<StatusCode>,
@ -65,7 +61,6 @@ impl StreamContext {
pub fn new(
metrics: Rc<Metrics>,
llm_providers: Rc<LlmProviders>,
traces_queue: Arc<Mutex<VecDeque<TraceData>>>,
overrides: Rc<Option<Overrides>>,
) -> Self {
StreamContext {
@ -83,7 +78,6 @@ impl StreamContext {
ttft_duration: None,
traceparent: None,
ttft_time: None,
traces_queue,
request_body_sent_time: None,
user_message: None,
upstream_status_code: None,
@ -333,68 +327,6 @@ impl StreamContext {
self.metrics
.output_sequence_length
.record(self.response_tokens as u64);
if let Some(traceparent) = self.traceparent.as_ref() {
let current_time_ns = current_time_ns();
match Traceparent::try_from(traceparent.to_string()) {
Err(e) => {
warn!("traceparent header is invalid: {}", e);
}
Ok(traceparent) => {
let service_name = match &self.resolved_api {
Some(api) => {
let api_display = api.to_string();
format!("archgw.{}", api_display)
}
None => "archgw".to_string(),
};
let mut trace_data =
common::tracing::TraceData::new_with_service_name(service_name);
let mut llm_span = Span::new(
self.llm_provider().name.to_string(),
Some(traceparent.trace_id),
Some(traceparent.parent_id),
self.request_body_sent_time.unwrap(),
current_time_ns,
);
llm_span
.add_attribute("model".to_string(), self.llm_provider().name.to_string());
if let Some(user_message) = &self.user_message {
llm_span.add_attribute("message".to_string(), user_message.clone());
}
// Add HTTP attributes
if let Some(method) = &self.http_method {
llm_span.add_attribute("http.method".to_string(), method.clone());
}
if let Some(protocol) = &self.http_protocol {
llm_span.add_attribute("http.protocol".to_string(), protocol.clone());
}
if let Some(status_code) = &self.upstream_status_code {
llm_span.add_attribute(
"http.status_code".to_string(),
status_code.as_u16().to_string(),
);
}
// Add request ID attribute
llm_span
.add_attribute("http.request_id".to_string(), self.request_identifier());
if self.ttft_time.is_some() {
llm_span.add_event(Event::new(
"time_to_first_token".to_string(),
self.ttft_time.unwrap(),
));
}
trace_data.add_span(llm_span);
self.traces_queue.lock().unwrap().push_back(trace_data);
}
};
}
}
fn read_raw_response_body(&mut self, body_size: usize) -> Result<Vec<u8>, Action> {