Add in Latency and output_sequence_length

Added a latency histogram and an output sequence length histogram to the wasm metrics. Updated the stream context so that, when end_stream is received, it records the time elapsed since the request was sent as well as the total number of tokens up to that point.
2026-06-29 15:49:40 +02:00 · 2024-11-06 22:58:14 -08:00 · 2024-11-06 22:58:14 -08:00 · 8fb5c4eceb
commit 8fb5c4eceb
parent 840b6a0e3e
2 changed files with 33 additions and 2 deletions
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -19,6 +19,9 @@ pub struct WasmMetrics {
    pub ratelimited_rq: Counter,
    pub time_to_first_token: Histogram,
    pub time_per_output_token: Histogram,
    pub latency: Histogram,
    pub output_sequence_length: Histogram,
    // TODO: Add Input Sequence Length
 }
 impl WasmMetrics {
@ -28,6 +31,8 @@ impl WasmMetrics {
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
            time_to_first_token: Histogram::new(String::from("time_to_first_token")),
            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
            latency: Histogram::new(String::from("latency")),
            output_sequence_length: Histogram::new(String::from("output_sequence_length")),
        }
    }
 }
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -269,6 +269,32 @@ impl HttpContext for StreamContext {
        let body = if self.streaming_response {
            if end_of_stream && body_size == 0 {
                // All streaming responses end with bytes=0 and end_stream=true
                // Record the latency for the request
                if let Some(start_time) = self.start_time {
                    match get_current_time() {
                        Ok(current_time) => match current_time.duration_since(start_time) {
                            Ok(duration) => {
                                // Convert the duration to milliseconds
                                let duration_ms = duration.as_millis();
                                debug!("Total latency: {} milliseconds", duration_ms);
                                // Record the latency to the latency histogram
                                self.metrics.latency.record(duration_ms as u64);
                            }
                            Err(e) => {
                                warn!("SystemTime error: {:?}", e);
                            }
                        },
                        Err(e) => {
                            warn!("Failed to get current time: {:?}", e);
                        }
                    }
                }
                // Record the output sequence length
                self.metrics
                    .output_sequence_length
                    .record(self.response_tokens as u64);
                return Action::Continue;
            }
            let chunk_start = 0;
@ -405,8 +431,8 @@ impl HttpContext for StreamContext {
                                    // Convert the duration to milliseconds
                                    let duration_ms = duration.as_millis();
                                    debug!(
-                                        "Time for Current Output Tokens: {} milliseconds",
+                                        "Time for Current Output Token: {} milliseconds",
-                                        duration_ms
+                                        duration_ms as u64 / token_count as u64
                                    );
                                    // Record TPOT metric for historgram
                                    self.metrics