Merge branch 'main' into adil/llm_tracing

This commit is contained in:
Adil Hafeez 2024-11-14 22:13:20 -08:00
commit 9e2fd2ee58
5 changed files with 264 additions and 14 deletions

View file

@ -1,4 +1,6 @@
![alt text](image.png)
<a href="https://www.producthunt.com/posts/arch-3?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_souce=badge-arch&#0045;3" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=565761&theme=light&period=daily" alt="Arch - Build&#0032;fast&#0044;&#0032;hyper&#0045;personalized&#0032;agents&#0032;with&#0032;intelligent&#0032;infra | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>
[![pre-commit](https://github.com/katanemo/arch/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/pre-commit.yml)
[![rust tests (prompt and llm gateway)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml/badge.svg)](https://github.com/katanemo/arch/actions/workflows/rust_tests.yml)
@ -53,9 +55,7 @@ Make sure you have following utilities installed before proceeding further,
1. [Docker System](https://docs.docker.com/get-started/get-docker/) (v24)
2. [Docker compose](https://docs.docker.com/compose/install/) (v2.29)
3. [Python](https://www.python.org/downloads/) (v3.10)
4. [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) (v1.8.3)
*Note: Poetry is needed for local development*
4. [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) (v1.8.3) — *Note: only needed for local development*
```console

View file

@ -18,6 +18,8 @@ pub struct WasmMetrics {
pub active_http_calls: Gauge,
pub ratelimited_rq: Counter,
pub time_to_first_token: Histogram,
pub time_per_output_token: Histogram,
pub tokens_per_second: Histogram,
pub request_latency: Histogram,
pub output_sequence_length: Histogram,
pub input_sequence_length: Histogram,
@ -29,6 +31,8 @@ impl WasmMetrics {
active_http_calls: Gauge::new(String::from("active_http_calls")),
ratelimited_rq: Counter::new(String::from("ratelimited_rq")),
time_to_first_token: Histogram::new(String::from("time_to_first_token")),
time_per_output_token: Histogram::new(String::from("time_per_output_token")),
tokens_per_second: Histogram::new(String::from("tokens_per_second")),
request_latency: Histogram::new(String::from("request_latency")),
output_sequence_length: Histogram::new(String::from("output_sequence_length")),
input_sequence_length: Histogram::new(String::from("input_sequence_length")),

View file

@ -311,6 +311,17 @@ impl HttpContext for StreamContext {
debug!("Total latency: {} milliseconds", duration_ms);
// Record the latency to the latency histogram
self.metrics.request_latency.record(duration_ms as u64);
// Token-derived metrics. Integer division by zero panics in Rust, so every
// quotient below is guarded: a response with no tokens, or a sub-millisecond
// response, skips the affected metric instead of aborting the filter.
let elapsed_ms = duration_ms as u64;
let output_tokens = self.response_tokens as u64;
// Compute the time per output token (skipped when no tokens were produced)
if let Some(tpot) = elapsed_ms.checked_div(output_tokens) {
    debug!("Time per output token: {} milliseconds", tpot);
    // Record the time per output token
    self.metrics.time_per_output_token.record(tpot);
}
// Compute tokens per second from the raw counts rather than from the
// already-truncated time per output token: this stays defined when that value
// rounds down to 0 and avoids compounding the integer rounding error.
if output_tokens > 0 {
    if let Some(tps) = output_tokens.saturating_mul(1000).checked_div(elapsed_ms) {
        debug!("Tokens per second: {}", tps);
        // Record the tokens per second
        self.metrics.tokens_per_second.record(tps);
    }
}
}
Err(e) => {
warn!("SystemTime error: {:?}", e);

View file

@ -77,6 +77,8 @@ fn setup_filter(module: &mut Tester, config: &str) -> i32 {
.expect_metric_creation(MetricType::Gauge, "active_http_calls")
.expect_metric_creation(MetricType::Counter, "ratelimited_rq")
.expect_metric_creation(MetricType::Histogram, "time_to_first_token")
.expect_metric_creation(MetricType::Histogram, "time_per_output_token")
.expect_metric_creation(MetricType::Histogram, "tokens_per_second")
.expect_metric_creation(MetricType::Histogram, "request_latency")
.expect_metric_creation(MetricType::Histogram, "output_sequence_length")
.expect_metric_creation(MetricType::Histogram, "input_sequence_length")

View file

@ -63,9 +63,7 @@
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
@ -73,7 +71,7 @@
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "11.3.0+security-01",
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": {
@ -93,7 +91,7 @@
"useBackend": false
}
],
"title": "# of Completd Requests",
"title": "# of Completed Requests",
"type": "stat"
},
{
@ -188,7 +186,7 @@
"sort": "none"
}
},
"pluginVersion": "11.3.0+security-01",
"pluginVersion": "11.3.0",
"targets": [
{
"disableTextWrap": false,
@ -298,7 +296,7 @@
"sort": "none"
}
},
"pluginVersion": "11.3.0+security-01",
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": {
@ -412,12 +410,16 @@
"sort": "none"
}
},
"pluginVersion": "11.3.0+security-01",
"pluginVersion": "11.3.0",
"targets": [
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"legendFormat": "__auto",
@ -492,12 +494,12 @@
{
"matcher": {
"id": "byName",
"options": "histogram_quantile(0.5, sum(rate(latency_bucket[60m])) by (le))"
"options": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))"
},
"properties": [
{
"id": "displayName",
"value": "Total Request Latency"
"value": "Request Latency"
}
]
},
@ -534,24 +536,255 @@
"sort": "none"
}
},
"pluginVersion": "11.3.0+security-01",
"pluginVersion": "11.3.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "request latency (p90)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "histogram_quantile(0.5, sum by(le) (rate(time_per_output_token_bucket[1h])))"
},
"properties": [
{
"id": "displayName",
"value": "Time per Output Token"
}
]
}
]
},
"gridPos": {
"h": 13,
"w": 12,
"x": 0,
"y": 23
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "11.3.0",
"targets": [
{
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(time_per_output_token_bucket[1h])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Time per Output Token (50p)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "histogram_quantile(0.5, sum by(le) (rate(tokens_per_second_bucket[1h])))"
},
"properties": [
{
"id": "displayName",
"value": "Tokens per Second"
}
]
}
]
},
"gridPos": {
"h": 13,
"w": 12,
"x": 12,
"y": 23
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "11.3.0",
"targets": [
{
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(tokens_per_second_bucket[1h])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Tokens per Second (50p)",
"type": "timeseries"
}
],
"preload": false,