diff --git a/crates/llm_gateway/src/filter_context.rs b/crates/llm_gateway/src/filter_context.rs
index 2a4d47a9..9a34fe98 100644
--- a/crates/llm_gateway/src/filter_context.rs
+++ b/crates/llm_gateway/src/filter_context.rs
@@ -18,6 +18,8 @@ pub struct WasmMetrics {
     pub active_http_calls: Gauge,
     pub ratelimited_rq: Counter,
     pub time_to_first_token: Histogram,
+    pub time_per_output_token: Histogram,
+    pub tokens_per_second: Histogram,
     pub request_latency: Histogram,
     pub output_sequence_length: Histogram,
     pub input_sequence_length: Histogram,
@@ -29,6 +31,8 @@ impl WasmMetrics {
             active_http_calls: Gauge::new(String::from("active_http_calls")),
             ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
             time_to_first_token: Histogram::new(String::from("time_to_first_token")),
+            time_per_output_token: Histogram::new(String::from("time_per_output_token")),
+            tokens_per_second: Histogram::new(String::from("tokens_per_second")),
             request_latency: Histogram::new(String::from("request_latency")),
             output_sequence_length: Histogram::new(String::from("output_sequence_length")),
             input_sequence_length: Histogram::new(String::from("input_sequence_length")),
diff --git a/crates/llm_gateway/src/stream_context.rs b/crates/llm_gateway/src/stream_context.rs
index 3b556a44..38266f72 100644
--- a/crates/llm_gateway/src/stream_context.rs
+++ b/crates/llm_gateway/src/stream_context.rs
@@ -296,6 +296,31 @@ impl HttpContext for StreamContext {
                     debug!("Total latency: {} milliseconds", duration_ms);
                     // Record the latency to the latency histogram
                     self.metrics.request_latency.record(duration_ms as u64);
+
+                    // Per-token metrics are undefined when no output tokens were
+                    // counted; integer division by zero would panic, so skip them.
+                    let total_ms = duration_ms as u64;
+                    let output_tokens = self.response_tokens as u64;
+                    if output_tokens > 0 {
+                        // Compute the time per output token
+                        let tpot = total_ms / output_tokens;
+
+                        debug!("Time per output token: {} milliseconds", tpot);
+                        // Record the time per output token
+                        self.metrics.time_per_output_token.record(tpot);
+
+                        // Derive throughput from the raw totals rather than from the
+                        // truncated `tpot`: `1000 / tpot` divides by zero whenever
+                        // tokens arrive faster than one per millisecond and loses
+                        // precision otherwise.
+                        if total_ms > 0 {
+                            let tokens_per_second = output_tokens.saturating_mul(1000) / total_ms;
+
+                            debug!("Tokens per second: {}", tokens_per_second);
+                            // Record the tokens per second
+                            self.metrics.tokens_per_second.record(tokens_per_second);
+                        }
+                    }
                 }
Err(e) => { warn!("SystemTime error: {:?}", e); diff --git a/crates/llm_gateway/tests/integration.rs b/crates/llm_gateway/tests/integration.rs index ea65bfa0..7107b4d2 100644 --- a/crates/llm_gateway/tests/integration.rs +++ b/crates/llm_gateway/tests/integration.rs @@ -75,6 +75,8 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 { .expect_metric_creation(MetricType::Gauge, "active_http_calls") .expect_metric_creation(MetricType::Counter, "ratelimited_rq") .expect_metric_creation(MetricType::Histogram, "time_to_first_token") + .expect_metric_creation(MetricType::Histogram, "time_per_output_token") + .expect_metric_creation(MetricType::Histogram, "tokens_per_second") .expect_metric_creation(MetricType::Histogram, "request_latency") .expect_metric_creation(MetricType::Histogram, "output_sequence_length") .expect_metric_creation(MetricType::Histogram, "input_sequence_length") diff --git a/demos/shared/grafana/dashboards/envoy_overview.json b/demos/shared/grafana/dashboards/envoy_overview.json index 5a77e075..4089dade 100644 --- a/demos/shared/grafana/dashboards/envoy_overview.json +++ b/demos/shared/grafana/dashboards/envoy_overview.json @@ -63,9 +63,7 @@ "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { - "calcs": [ - "lastNotNull" - ], + "calcs": ["lastNotNull"], "fields": "", "values": false }, @@ -73,7 +71,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.3.0+security-01", + "pluginVersion": "11.3.0", "targets": [ { "datasource": { @@ -93,7 +91,7 @@ "useBackend": false } ], - "title": "# of Completd Requests", + "title": "# of Completed Requests", "type": "stat" }, { @@ -188,7 +186,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.0+security-01", + "pluginVersion": "11.3.0", "targets": [ { "disableTextWrap": false, @@ -298,7 +296,7 @@ "sort": "none" } }, - "pluginVersion": "11.3.0+security-01", + "pluginVersion": "11.3.0", "targets": [ { "datasource": { @@ -412,11 +410,11 @@ "sort": "none" } }, - 
"pluginVersion": "11.3.0+security-01", + "pluginVersion": "11.3.0", "targets": [ { "disableTextWrap": false, - "editorMode": "builder", + "editorMode": "code", "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))", "fullMetaSearch": false, "includeNullMetadata": false, @@ -492,12 +490,12 @@ { "matcher": { "id": "byName", - "options": "histogram_quantile(0.5, sum(rate(latency_bucket[60m])) by (le))" + "options": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))" }, "properties": [ { "id": "displayName", - "value": "Total Request Latency" + "value": "Request Latency" } ] }, @@ -534,37 +532,246 @@ "sort": "none" } }, - "pluginVersion": "11.3.0+security-01", + "pluginVersion": "11.3.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(latency_bucket[60m])) by (le))", + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))", + "fullMetaSearch": false, "hide": false, + "includeNullMetadata": false, "instant": false, "legendFormat": "__auto", "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "histogram_quantile(0.5, sum(rate(time_to_first_token_bucket[60m])) by (le))", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" + "refId": "A", + "useBackend": false } ], "title": "request latency (p50)", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "histogram_quantile(0.5, sum by(le) (rate(time_per_output_token_bucket[1h])))" + }, + "properties": [ + { + "id": "displayName", + "value": "Time per Output Token" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(time_per_output_token_bucket[1h])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Time per Output Token (50p)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "histogram_quantile(0.5, sum by(le) (rate(tokens_per_second_bucket[1h])))" + }, + "properties": [ + { + "id": "displayName", + "value": "Tokens per Second" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(tokens_per_second_bucket[1h])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Tokens per Second(50p)", + "type": "timeseries" } ], "preload": false,