Add time-per-output-token (TPOT) and tokens-per-second metrics

This commit is contained in:
aayushwhiz 2024-11-14 13:49:08 -08:00
parent 9eeb790c7f
commit 1e6d25a90e
2 changed files with 15 additions and 0 deletions

View file

@ -18,6 +18,8 @@ pub struct WasmMetrics {
pub active_http_calls: Gauge,
pub ratelimited_rq: Counter,
pub time_to_first_token: Histogram,
pub time_per_output_token: Histogram,
pub tokens_per_second: Histogram,
pub request_latency: Histogram,
pub output_sequence_length: Histogram,
pub input_sequence_length: Histogram,
@ -29,6 +31,8 @@ impl WasmMetrics {
active_http_calls: Gauge::new(String::from("active_http_calls")),
ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
time_to_first_token: Histogram::new(String::from("time_to_first_token")),
time_per_output_token: Histogram::new(String::from("time_per_output_token")),
tokens_per_second: Histogram::new(String::from("tokens_per_second")),
request_latency: Histogram::new(String::from("request_latency")),
output_sequence_length: Histogram::new(String::from("output_sequence_length")),
input_sequence_length: Histogram::new(String::from("input_sequence_length")),

View file

@ -296,6 +296,17 @@ impl HttpContext for StreamContext {
debug!("Total latency: {} milliseconds", duration_ms);
// Record the latency to the latency histogram
self.metrics.request_latency.record(duration_ms as u64);
// Compute the time per output token
let tpot = (duration_ms as u64
- self.ttft_duration.unwrap().as_millis() as u64)
/ self.response_tokens as u64;
// Record the time per output token
self.metrics.time_per_output_token.record(tpot);
// Record the tokens per second
self.metrics.tokens_per_second.record(1000 / tpot);
}
Err(e) => {
warn!("SystemTime error: {:?}", e);