From bf39fecd6d5816d036034feedb94eb47a13eb161 Mon Sep 17 00:00:00 2001 From: aayushwhiz Date: Wed, 6 Nov 2024 16:32:45 -0800 Subject: [PATCH] add in tpot stat setup check for first token as well as time per token after that --- crates/llm_gateway/src/filter_context.rs | 2 + crates/llm_gateway/src/stream_context.rs | 52 +++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index b603103a..6719cbe1 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -18,6 +18,7 @@ pub struct WasmMetrics { pub active_http_calls: Gauge, pub ratelimited_rq: Counter, pub time_to_first_token: Histogram, + pub time_per_output_token: Histogram, } impl WasmMetrics { @@ -26,6 +27,7 @@ impl WasmMetrics { active_http_calls: Gauge::new(String::from("active_http_calls")), ratelimited_rq: Counter::new(String::from("ratelimited_rq")), time_to_first_token: Histogram::new(String::from("time_to_first_token")), + time_per_output_token: Histogram::new(String::from("time_per_output_token")), } } } diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 43d9f0aa..701d7dd0 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -38,6 +38,8 @@ pub struct StreamContext { start_time: Option, ttft_recorded: bool, ttft_duration: Option, // Store the duration directly + first_token_processed: bool, + last_token_time: Option, } impl StreamContext { @@ -55,6 +57,8 @@ impl StreamContext { start_time: None, ttft_recorded: false, ttft_duration: None, + first_token_processed: false, + last_token_time: None, } } fn llm_provider(&self) -> &LlmProvider { @@ -354,14 +358,14 @@ impl HttpContext for StreamContext { }; self.response_tokens += token_count; - // Compute TFT if not already recorded + // Compute TTFT if not already recorded if !self.ttft_recorded { if let Some(start_time) = self.start_time { match get_current_time() { Ok(current_time) => match current_time.duration_since(start_time) { Ok(duration) => { let duration_ms = duration.as_millis(); - debug!("Time to First Token (TFT): {} milliseconds", duration_ms); + debug!("Time to First Token (TTFT): {} milliseconds", duration_ms); self.ttft_duration = Some(duration); self.metrics.time_to_first_token.record(duration_ms as u64); } @@ -378,6 +382,50 @@ impl HttpContext for StreamContext { warn!("Start time was not recorded"); } } + // Check if first token was not processed yet, and if there are tokens in the response. + // If so, set the last_token_time to now and set first_token_processed to true + if !self.first_token_processed && token_count > 0 { + self.first_token_processed = true; + // Set last_token_time to now + match get_current_time() { + Ok(current_time) => { + self.last_token_time = Some(current_time); + } + Err(e) => { + warn!("Failed to get current time: {:?}", e); + } + } + } else if self.first_token_processed && token_count == 0 { + if let Some(last_token_time) = self.last_token_time { + match get_current_time() { + Ok(current_time) => { + // record the time for the current output token and calculate the time per output token + match current_time.duration_since(last_token_time) { + Ok(duration) => { + // Convert the duration to milliseconds + let duration_ms = duration.as_millis(); + debug!( + "Time for Current Output Token: {} milliseconds", + duration_ms + ); + // Record TPOT metric for historgram + self.metrics + .time_per_output_token + .record(duration_ms as u64); + } + Err(e) => { + warn!("SystemTime error: {:?}", e); + } + } + // Set last_token_time to now + self.last_token_time = Some(current_time); + } + Err(e) => { + warn!("Failed to get current time: {:?}", e); + } + } + } + } } else { debug!("non streaming response"); let chat_completions_response: ChatCompletionsResponse =