plano/config/grafana/brightstaff_dashboard.json
Adil Hafeez 22f332f62d
Some checks are pending
CI / pre-commit (push) Waiting to run
CI / plano-tools-tests (push) Waiting to run
CI / native-smoke-test (push) Waiting to run
CI / docker-build (push) Waiting to run
CI / validate-config (push) Waiting to run
CI / security-scan (push) Blocked by required conditions
CI / test-prompt-gateway (push) Blocked by required conditions
CI / test-model-alias-routing (push) Blocked by required conditions
CI / test-responses-api-with-state (push) Blocked by required conditions
CI / e2e-plano-tests (3.10) (push) Blocked by required conditions
CI / e2e-plano-tests (3.11) (push) Blocked by required conditions
CI / e2e-plano-tests (3.12) (push) Blocked by required conditions
CI / e2e-plano-tests (3.13) (push) Blocked by required conditions
CI / e2e-plano-tests (3.14) (push) Blocked by required conditions
CI / e2e-demo-preference (push) Blocked by required conditions
CI / e2e-demo-currency (push) Blocked by required conditions
Publish docker image (latest) / build-arm64 (push) Waiting to run
Publish docker image (latest) / build-amd64 (push) Waiting to run
Publish docker image (latest) / create-manifest (push) Blocked by required conditions
Build and Deploy Documentation / build (push) Waiting to run
Add Prometheus metrics endpoint and Grafana dashboard for brightstaff (#904)
2026-04-22 11:19:10 -07:00

541 lines
19 KiB
JSON

{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "RED, LLM upstream, routing service, and process metrics for brightstaff. Pair with Envoy admin metrics from cluster=bright_staff.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "HTTP RED",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisLabel": "req/s",
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 1,
"showPoints": "never"
},
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
"id": 1,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (handler) (rate(brightstaff_http_requests_total[1m]))",
"legendFormat": "{{handler}}",
"refId": "A"
}
],
"title": "Rate — brightstaff RPS by handler",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "5xx fraction over 5m. Page-worthy when sustained above ~1%.",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"unit": "percentunit"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_http_requests_total{status_class=\"5xx\"}[5m])) / clamp_min(sum(rate(brightstaff_http_requests_total[5m])), 1)",
"legendFormat": "5xx rate",
"refId": "A"
}
],
"title": "Errors — brightstaff 5xx rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "p50/p95/p99 by handler, computed from histogram buckets over 5m.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 24, "x": 0, "y": 9 },
"id": 3,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.50, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{handler}}",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{handler}}",
"refId": "B"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.99, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))",
"legendFormat": "p99 {{handler}}",
"refId": "C"
}
],
"title": "Duration — p50 / p95 / p99 by handler",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "In-flight requests by handler. Climbs before latency does when brightstaff is saturated.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 },
"id": 4,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (handler) (brightstaff_http_in_flight_requests)",
"legendFormat": "{{handler}}",
"refId": "A"
}
],
"title": "In-flight requests by handler",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
"id": 200,
"panels": [],
"title": "LLM upstream",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 27 },
"id": 5,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_upstream_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{provider}}/{{model}}",
"refId": "A"
}
],
"title": "LLM upstream p95 by provider/model",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "All non-success error classes. timeout/connect = network, 5xx/429 = provider, parse = body shape mismatch, stream = mid-stream disconnect.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 27 },
"id": 6,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (provider, error_class) (rate(brightstaff_llm_upstream_requests_total{error_class!=\"none\"}[5m]))",
"legendFormat": "{{provider}} / {{error_class}}",
"refId": "A"
}
],
"title": "LLM upstream errors by provider / class",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Streaming only. Empty if the route never streams.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 36 },
"id": 7,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_time_to_first_token_seconds_bucket[5m])))",
"legendFormat": "p95 {{provider}}/{{model}}",
"refId": "A"
}
],
"title": "Time-to-first-token p95 (streaming)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Tokens/sec by provider/model/kind — proxy for cost. Stacked.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "tokens/s"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 36 },
"id": 8,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (provider, model, kind) (rate(brightstaff_llm_tokens_total[5m]))",
"legendFormat": "{{provider}}/{{model}} {{kind}}",
"refId": "A"
}
],
"title": "Token throughput by provider / model / kind",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 },
"id": 300,
"panels": [],
"title": "Routing service",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Which models the orchestrator picked over the last 15 minutes.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "short"
}
},
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 46 },
"id": 9,
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (selected_model) (increase(brightstaff_router_decisions_total[15m]))",
"legendFormat": "{{selected_model}}",
"refId": "A"
}
],
"title": "Model selection distribution (last 15m)",
"type": "bargauge"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Fraction of decisions that fell back (orchestrator returned `none` or errored). High = router can't classify intent or no candidates configured.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "percentunit"
}
},
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 46 },
"id": 10,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (route) (rate(brightstaff_router_decisions_total{fallback=\"true\"}[5m])) / clamp_min(sum by (route) (rate(brightstaff_router_decisions_total[5m])), 1)",
"legendFormat": "{{route}}",
"refId": "A"
}
],
"title": "Fallback rate by route",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" },
"unit": "s"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 55 },
"id": 11,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.95, sum by (le, route) (rate(brightstaff_router_decision_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{route}}",
"refId": "A"
}
],
"title": "Router decision p95 latency",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Hit / (hit + miss). Low ratio = sessions aren't being reused or TTL too short.",
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "green", "value": 0.8 }
]
},
"unit": "percentunit",
"min": 0,
"max": 1
}
},
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 55 },
"id": 12,
"options": {
"colorMode": "background",
"graphMode": "area",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_session_cache_events_total{outcome=\"hit\"}[5m])) / clamp_min(sum(rate(brightstaff_session_cache_events_total{outcome=~\"hit|miss\"}[5m])), 1)",
"legendFormat": "hit rate",
"refId": "A"
}
],
"title": "Session cache hit rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "decision_served = a real model picked. no_candidates = sentinel `none` returned. policy_error = orchestrator failed.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 55 },
"id": 13,
"options": {
"legend": { "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum by (outcome) (rate(brightstaff_routing_service_requests_total[5m]))",
"legendFormat": "{{outcome}}",
"refId": "A"
}
],
"title": "/routing/* outcomes",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 63 },
"id": 400,
"panels": [],
"title": "Process & Envoy link",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"description": "Compare to brightstaff RPS (panel 1) — sustained gap = network or Envoy queueing.",
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" },
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 },
"id": 14,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=\"bright_staff\"}[1m]))",
"legendFormat": "envoy → bright_staff",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(brightstaff_http_requests_total[1m]))",
"legendFormat": "brightstaff served",
"refId": "B"
}
],
"title": "Envoy → brightstaff link health",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }
},
"overrides": [
{
"matcher": { "id": "byName", "options": "RSS" },
"properties": [{ "id": "unit", "value": "bytes" }]
},
{
"matcher": { "id": "byName", "options": "CPU" },
"properties": [{ "id": "unit", "value": "percentunit" }]
}
]
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 },
"id": 15,
"options": {
"legend": { "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "process_resident_memory_bytes{job=\"brightstaff\"}",
"legendFormat": "RSS",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "rate(process_cpu_seconds_total{job=\"brightstaff\"}[1m])",
"legendFormat": "CPU",
"refId": "B"
}
],
"title": "Brightstaff process RSS / CPU",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["plano", "brightstaff", "llm"],
"templating": {
"list": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"type": "datasource",
"query": "prometheus",
"current": { "selected": false, "text": "Prometheus", "value": "DS_PROMETHEUS" },
"hide": 0,
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"includeAll": false,
"multi": false
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Brightstaff (Plano dataplane)",
"uid": "brightstaff",
"version": 1,
"weekStart": ""
}