Add TPOT (time per output token) and tokens-per-second metrics (#269)

* Add time per output token (TPOT) and tokens-per-second metrics
* Add debug logs for the new stats and update integration tests
* Update the shared dashboard to include the new stats
2026-05-15 11:02:39 +02:00 · 2024-11-14 15:03:08 -08:00 · 2024-11-14 15:03:08 -08:00 · 1d229cba8f
commit 1d229cba8f
parent 9eeb790c7f
4 changed files with 252 additions and 28 deletions
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@ -18,6 +18,8 @@ pub struct WasmMetrics {
    pub active_http_calls: Gauge,
    pub ratelimited_rq: Counter,
    pub time_to_first_token: Histogram,
+    pub time_per_output_token: Histogram,
+    pub tokens_per_second: Histogram,
    pub request_latency: Histogram,
    pub output_sequence_length: Histogram,
    pub input_sequence_length: Histogram,
@ -29,6 +31,8 @@ impl WasmMetrics {
            active_http_calls: Gauge::new(String::from("active_http_calls")),
            ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
            time_to_first_token: Histogram::new(String::from("time_to_first_token")),
+            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
+            tokens_per_second: Histogram::new(String::from("tokens_per_second")),
            request_latency: Histogram::new(String::from("request_latency")),
            output_sequence_length: Histogram::new(String::from("output_sequence_length")),
            input_sequence_length: Histogram::new(String::from("input_sequence_length")),
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@ -296,6 +296,17 @@ impl HttpContext for StreamContext {
                        debug!("Total latency: {} milliseconds", duration_ms);
                        // Record the latency to the latency histogram
                        self.metrics.request_latency.record(duration_ms as u64);
+
+                        // Guard both divisions: zero tokens or a 0 ms duration would panic.
+                        let (tokens, millis) = (self.response_tokens as u64, duration_ms as u64);
+                        if tokens > 0 && millis > 0 {
+                            let tpot = millis / tokens; // milliseconds per output token
+                            debug!("Time per output token: {} milliseconds", tpot);
+                            self.metrics.time_per_output_token.record(tpot);
+                            let tps = tokens * 1000 / millis; // not 1000 / tpot: tpot may be 0
+                            debug!("Tokens per second: {}", tps);
+                            self.metrics.tokens_per_second.record(tps);
+                        }
                    }
                    Err(e) => {
                        warn!("SystemTime error: {:?}", e);