From 8fb5c4eceb0ae5e661cd6585d6c475fc56ae2f2f Mon Sep 17 00:00:00 2001 From: aayushwhiz Date: Wed, 6 Nov 2024 22:58:14 -0800 Subject: [PATCH] Add in Latency and output_sequence_length Added latency histogram and output sequence length histogram to the wasm metrics. Updated stream context so that when the end_stream is received, it stores the time since the request was sent as well as the total number of tokens up to that point. --- crates/llm_gateway/src/filter_context.rs | 5 ++++ crates/llm_gateway/src/stream_context.rs | 30 ++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index 6719cbe1..c518e244 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -19,6 +19,9 @@ pub struct WasmMetrics { pub ratelimited_rq: Counter, pub time_to_first_token: Histogram, pub time_per_output_token: Histogram, + pub latency: Histogram, + pub output_sequence_length: Histogram, + // TODO: Add Input Sequence Length } impl WasmMetrics { @@ -28,6 +31,8 @@ impl WasmMetrics { ratelimited_rq: Counter::new(String::from("ratelimited_rq")), time_to_first_token: Histogram::new(String::from("time_to_first_token")), time_per_output_token: Histogram::new(String::from("time_per_output_token")), + latency: Histogram::new(String::from("latency")), + output_sequence_length: Histogram::new(String::from("output_sequence_length")), } } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index f5545928..298a9f68 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -269,6 +269,32 @@ impl HttpContext for StreamContext { let body = if self.streaming_response { if end_of_stream && body_size == 0 { + // All streaming responses end with bytes=0 and end_stream=true + // Record the latency for the request + if let Some(start_time) = self.start_time { + 
match get_current_time() { + Ok(current_time) => match current_time.duration_since(start_time) { + Ok(duration) => { + // Convert the duration to milliseconds + let duration_ms = duration.as_millis(); + debug!("Total latency: {} milliseconds", duration_ms); + // Record the latency to the latency histogram + self.metrics.latency.record(duration_ms as u64); + } + Err(e) => { + warn!("SystemTime error: {:?}", e); + } + }, + Err(e) => { + warn!("Failed to get current time: {:?}", e); + } + } + } + // Record the output sequence length + self.metrics + .output_sequence_length + .record(self.response_tokens as u64); + return Action::Continue; } let chunk_start = 0; @@ -405,8 +431,8 @@ impl HttpContext for StreamContext { // Convert the duration to milliseconds let duration_ms = duration.as_millis(); debug!( - "Time for Current Output Tokens: {} milliseconds", - duration_ms + "Time for Current Output Token: {} milliseconds", + duration_ms as u64 / token_count as u64 ); // Record TPOT metric for historgram self.metrics