diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs index 4cf2e9c5..253d8cea 100644 --- a/crates/llm_gateway/src/stream_context.rs +++ b/crates/llm_gateway/src/stream_context.rs @@ -302,9 +302,11 @@ impl HttpContext for StreamContext { - self.ttft_duration.unwrap().as_millis() as u64) / self.response_tokens as u64; + debug!("Time per output token: {} milliseconds", tpot); // Record the time per output token self.metrics.time_per_output_token.record(tpot); + debug!("Tokens per second: {}", 1000 / tpot); // Record the tokens per second self.metrics.tokens_per_second.record(1000 / tpot); } diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index ea65bfa0..7107b4d2 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -75,6 +75,8 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 { .expect_metric_creation(MetricType::Gauge, "active_http_calls") .expect_metric_creation(MetricType::Counter, "ratelimited_rq") .expect_metric_creation(MetricType::Histogram, "time_to_first_token") + .expect_metric_creation(MetricType::Histogram, "time_per_output_token") + .expect_metric_creation(MetricType::Histogram, "tokens_per_second") .expect_metric_creation(MetricType::Histogram, "request_latency") .expect_metric_creation(MetricType::Histogram, "output_sequence_length") .expect_metric_creation(MetricType::Histogram, "input_sequence_length")