From 1e6d25a90e5292016dc6be92826425fea466ce94 Mon Sep 17 00:00:00 2001 From: aayushwhiz Date: Thu, 14 Nov 2024 13:49:08 -0800 Subject: [PATCH] add in tpot and tokens per second --- crates/llm_gateway/src/filter_context.rs | 4 ++++ crates/llm_gateway/src/stream_context.rs | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs index 2a4d47a9..9a34fe98 100644 --- a/crates/llm_gateway/src/filter_context.rs +++ b/crates/llm_gateway/src/filter_context.rs @@ -18,6 +18,8 @@ pub struct WasmMetrics { pub active_http_calls: Gauge, pub ratelimited_rq: Counter, pub time_to_first_token: Histogram, + pub time_per_output_token: Histogram, + pub tokens_per_second: Histogram, pub request_latency: Histogram, pub output_sequence_length: Histogram, pub input_sequence_length: Histogram, @@ -29,6 +31,8 @@ impl WasmMetrics { active_http_calls: Gauge::new(String::from("active_http_calls")), ratelimited_rq: Counter::new(String::from("ratelimited_rq")), time_to_first_token: Histogram::new(String::from("time_to_first_token")), + time_per_output_token: Histogram::new(String::from("time_per_output_token")), + tokens_per_second: Histogram::new(String::from("tokens_per_second")), request_latency: Histogram::new(String::from("request_latency")), output_sequence_length: Histogram::new(String::from("output_sequence_length")), input_sequence_length: Histogram::new(String::from("input_sequence_length")), diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 3b556a44..4cf2e9c5 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -296,6 +296,17 @@ impl HttpContext for StreamContext { debug!("Total latency: {} milliseconds", duration_ms); // Record the latency to the latency histogram self.metrics.request_latency.record(duration_ms as u64); + + // Compute the time per output token + let tpot = (duration_ms as u64 + - self.ttft_duration.unwrap().as_millis() as u64) + / self.response_tokens as u64; + + // Record the time per output token + self.metrics.time_per_output_token.record(tpot); + + // Record the tokens per second + self.metrics.tokens_per_second.record(1000 / tpot); } Err(e) => { warn!("SystemTime error: {:?}", e);