From 1e6d25a90e5292016dc6be92826425fea466ce94 Mon Sep 17 00:00:00 2001
From: aayushwhiz <aayushwhiz@gmail.com>
Date: Thu, 14 Nov 2024 13:49:08 -0800
Subject: [PATCH] add in tpot and tokens per second

---
 crates/llm_gateway/src/filter_context.rs |  4 ++++
 crates/llm_gateway/src/stream_context.rs | 11 +++++++++++
 2 files changed, 15 insertions(+)

diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs
index 2a4d47a9..9a34fe98 100644
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@@ -18,6 +18,8 @@ pub struct WasmMetrics {
     pub active_http_calls: Gauge,
     pub ratelimited_rq: Counter,
     pub time_to_first_token: Histogram,
+    pub time_per_output_token: Histogram,
+    pub tokens_per_second: Histogram,
     pub request_latency: Histogram,
     pub output_sequence_length: Histogram,
     pub input_sequence_length: Histogram,
@@ -29,6 +31,8 @@ impl WasmMetrics {
             active_http_calls: Gauge::new(String::from("active_http_calls")),
             ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
             time_to_first_token: Histogram::new(String::from("time_to_first_token")),
+            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
+            tokens_per_second: Histogram::new(String::from("tokens_per_second")),
             request_latency: Histogram::new(String::from("request_latency")),
             output_sequence_length: Histogram::new(String::from("output_sequence_length")),
             input_sequence_length: Histogram::new(String::from("input_sequence_length")),
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 3b556a44..4cf2e9c5 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -296,6 +296,29 @@ impl HttpContext for StreamContext {
                         debug!("Total latency: {} milliseconds", duration_ms);
                         // Record the latency to the latency histogram
                         self.metrics.request_latency.record(duration_ms as u64);
+
+                        // Time per output token (TPOT, ms): the time spent after the
+                        // first token divided by the number of response tokens. The
+                        // arithmetic is checked so that a missing TTFT, a zero token
+                        // count, or a TTFT larger than the total latency skips or
+                        // clamps the sample instead of panicking.
+                        let tpot = self.ttft_duration.and_then(|ttft| {
+                            (duration_ms as u64)
+                                .saturating_sub(ttft.as_millis() as u64)
+                                .checked_div(self.response_tokens as u64)
+                        });
+
+                        if let Some(tpot) = tpot {
+                            // Record the time per output token
+                            self.metrics.time_per_output_token.record(tpot);
+
+                            // Record the tokens per second. Integer division can round
+                            // TPOT down to 0 ms, for which the rate is undefined, so
+                            // that sample is skipped rather than dividing by zero.
+                            if let Some(tokens_per_second) = 1000u64.checked_div(tpot) {
+                                self.metrics.tokens_per_second.record(tokens_per_second);
+                            }
+                        }
                     }
                     Err(e) => {
                         warn!("SystemTime error: {:?}", e);