diff --git a/config/grafana/brightstaff_dashboard.json b/config/grafana/brightstaff_dashboard.json new file mode 100644 index 00000000..4b54721f --- /dev/null +++ b/config/grafana/brightstaff_dashboard.json @@ -0,0 +1,541 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "RED, LLM upstream, routing service, and process metrics for brightstaff. Pair with Envoy admin metrics from cluster=bright_staff.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "HTTP RED", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "req/s", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 1, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "Rate — brightstaff RPS by handler", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "5xx fraction over 5m. Page-worthy when sustained above ~1%.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total{status_class=\"5xx\"}[5m])) / clamp_min(sum(rate(brightstaff_http_requests_total[5m])), 1)", + "legendFormat": "5xx rate", + "refId": "A" + } + ], + "title": "Errors — brightstaff 5xx rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "p50/p95/p99 by handler, computed from histogram buckets over 5m.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 24, "x": 0, "y": 9 }, + "id": 3, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.50, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p50 {{handler}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{handler}}", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.99, sum by (le, handler) (rate(brightstaff_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "p99 {{handler}}", + "refId": "C" + } + ], + "title": "Duration — p50 / p95 / p99 by handler", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "In-flight requests by handler. Climbs before latency does when brightstaff is saturated.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 18 }, + "id": 4, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (handler) (brightstaff_http_in_flight_requests)", + "legendFormat": "{{handler}}", + "refId": "A" + } + ], + "title": "In-flight requests by handler", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 200, + "panels": [], + "title": "LLM upstream", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 27 }, + "id": 5, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_upstream_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "LLM upstream p95 by provider/model", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "All non-success error classes. timeout/connect = network, 5xx/429 = provider, parse = body shape mismatch, stream = mid-stream disconnect.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 27 }, + "id": 6, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, error_class) (rate(brightstaff_llm_upstream_requests_total{error_class!=\"none\"}[5m]))", + "legendFormat": "{{provider}} / {{error_class}}", + "refId": "A" + } + ], + "title": "LLM upstream errors by provider / class", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Streaming only. Empty if the route never streams.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 36 }, + "id": 7, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, provider, model) (rate(brightstaff_llm_time_to_first_token_seconds_bucket[5m])))", + "legendFormat": "p95 {{provider}}/{{model}}", + "refId": "A" + } + ], + "title": "Time-to-first-token p95 (streaming)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Tokens/sec by provider/model/kind — proxy for cost. Stacked.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "tokens/s" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 36 }, + "id": 8, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (provider, model, kind) (rate(brightstaff_llm_tokens_total[5m]))", + "legendFormat": "{{provider}}/{{model}} {{kind}}", + "refId": "A" + } + ], + "title": "Token throughput by provider / model / kind", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, + "id": 300, + "panels": [], + "title": "Routing service", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Which models the orchestrator picked over the last 15 minutes.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "short" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 46 }, + "id": 9, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (selected_model) (increase(brightstaff_router_decisions_total[15m]))", + "legendFormat": "{{selected_model}}", + "refId": "A" + } + ], + "title": "Model selection distribution (last 15m)", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Fraction of decisions that fell back (orchestrator returned `none` or errored). High = router can't classify intent or no candidates configured.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "percentunit" + } + }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 46 }, + "id": 10, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (route) (rate(brightstaff_router_decisions_total{fallback=\"true\"}[5m])) / clamp_min(sum by (route) (rate(brightstaff_router_decisions_total[5m])), 1)", + "legendFormat": "{{route}}", + "refId": "A" + } + ], + "title": "Fallback rate by route", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 5, "lineWidth": 1, "showPoints": "never" }, + "unit": "s" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 55 }, + "id": 11, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (le, route) (rate(brightstaff_router_decision_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{route}}", + "refId": "A" + } + ], + "title": "Router decision p95 latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Hit / (hit + miss). Low ratio = sessions aren't being reused or TTL too short.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.5 }, + { "color": "green", "value": 0.8 } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 55 }, + "id": 12, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_session_cache_events_total{outcome=\"hit\"}[5m])) / clamp_min(sum(rate(brightstaff_session_cache_events_total{outcome=~\"hit|miss\"}[5m])), 1)", + "legendFormat": "hit rate", + "refId": "A" + } + ], + "title": "Session cache hit rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "decision_served = a real model picked. no_candidates = sentinel `none` returned. policy_error = orchestrator failed.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 30, "lineWidth": 1, "showPoints": "never", "stacking": { "mode": "normal" } }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 55 }, + "id": 13, + "options": { + "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (outcome) (rate(brightstaff_routing_service_requests_total[5m]))", + "legendFormat": "{{outcome}}", + "refId": "A" + } + ], + "title": "/routing/* outcomes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 63 }, + "id": 400, + "panels": [], + "title": "Process & Envoy link", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "Compare to brightstaff RPS (panel 1) — sustained gap = network or Envoy queueing.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 64 }, + "id": 14, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=\"bright_staff\"}[1m]))", + "legendFormat": "envoy → bright_staff", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum(rate(brightstaff_http_requests_total[1m]))", + "legendFormat": "brightstaff served", + "refId": "B" + } + ], + "title": "Envoy → brightstaff link health", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 1, "showPoints": "never" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "RSS" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "CPU" }, + "properties": [{ "id": "unit", "value": "percentunit" }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 64 }, + "id": 15, + "options": { + "legend": { "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "process_resident_memory_bytes{job=\"brightstaff\"}", + "legendFormat": "RSS", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rate(process_cpu_seconds_total{job=\"brightstaff\"}[1m])", + "legendFormat": "CPU", + "refId": "B" + } + ], + "title": "Brightstaff process RSS / CPU", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["plano", "brightstaff", "llm"], + "templating": { + "list": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "type": "datasource", + "query": "prometheus", + "current": { "selected": false, "text": "Prometheus", "value": "DS_PROMETHEUS" }, + "hide": 0, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "includeAll": false, + "multi": false + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Brightstaff (Plano dataplane)", + "uid": "brightstaff", + "version": 1, + "weekStart": "" +} diff --git a/config/grafana/docker-compose.yaml b/config/grafana/docker-compose.yaml new file mode 100644 index 00000000..33238073 --- /dev/null +++ b/config/grafana/docker-compose.yaml @@ -0,0 +1,43 @@ +# One-command Prometheus + Grafana stack for observing a locally-running +# Plano (Envoy admin :9901 + brightstaff :9092 on the host). +# +# cd config/grafana +# docker compose up -d +# open http://localhost:3000 (admin / admin) +# +# Grafana is preloaded with: +# - Prometheus datasource (uid=DS_PROMETHEUS) → http://prometheus:9090 +# - Brightstaff dashboard (auto-imported from brightstaff_dashboard.json) +# +# Prometheus scrapes the host's :9092 and :9901 via host.docker.internal. +# On Linux this works because of the `extra_hosts: host-gateway` mapping +# below. On Mac/Win it works natively. + +services: + prometheus: + image: prom/prometheus:latest + container_name: plano-prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus_scrape.yaml:/etc/prometheus/prometheus.yml:ro + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + + grafana: + image: grafana/grafana:latest + container_name: plano-grafana + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + volumes: + - ./provisioning:/etc/grafana/provisioning:ro + - ./brightstaff_dashboard.json:/var/lib/grafana/dashboards/brightstaff_dashboard.json:ro + depends_on: + - prometheus + restart: unless-stopped diff --git a/config/grafana/prometheus_scrape.yaml b/config/grafana/prometheus_scrape.yaml new file mode 100644 index 00000000..b4041287 --- /dev/null +++ b/config/grafana/prometheus_scrape.yaml @@ -0,0 +1,44 @@ +# Prometheus config that scrapes Plano (Envoy admin + brightstaff). This is +# a complete Prometheus config — mount it directly at +# /etc/prometheus/prometheus.yml. The included docker-compose.yaml does this +# for you. +# +# Targets: +# - envoy:9901 Envoy admin → envoy_cluster_*, envoy_http_*, envoy_server_*. +# - brightstaff:9092 Native dataplane → brightstaff_http_*, brightstaff_llm_*, +# brightstaff_router_*, process_*. +# +# Hostname `host.docker.internal` works on Docker Desktop (Mac/Win) and on +# Linux when the container is started with `--add-host=host.docker.internal: +# host-gateway` (the included compose does this). If Plano runs *inside* +# Docker on the same network as Prometheus, replace it with the container +# name (e.g. `plano:9092`). +# +# This file is unrelated to demos/llm_routing/model_routing_service/prometheus.yaml, +# which scrapes a fake metrics service to feed the routing engine. + +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +scrape_configs: + - job_name: envoy + honor_timestamps: true + metrics_path: /stats + params: + format: ["prometheus"] + static_configs: + - targets: + - host.docker.internal:9901 + labels: + service: plano + + - job_name: brightstaff + honor_timestamps: true + metrics_path: /metrics + static_configs: + - targets: + - host.docker.internal:9092 + labels: + service: plano diff --git a/config/grafana/provisioning/dashboards/brightstaff.yaml b/config/grafana/provisioning/dashboards/brightstaff.yaml new file mode 100644 index 00000000..271e4a9b --- /dev/null +++ b/config/grafana/provisioning/dashboards/brightstaff.yaml @@ -0,0 +1,15 @@ +# Auto-load the brightstaff dashboard JSON on Grafana startup. + +apiVersion: 1 + +providers: + - name: brightstaff + orgId: 1 + folder: Plano + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/config/grafana/provisioning/datasources/prometheus.yaml b/config/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 00000000..2e3170ec --- /dev/null +++ b/config/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,14 @@ +# Auto-provision the Prometheus datasource so the bundled dashboard wires up +# without any clicks. The `uid: DS_PROMETHEUS` matches the templated input in +# brightstaff_dashboard.json. + +apiVersion: 1 + +datasources: + - name: Prometheus + uid: DS_PROMETHEUS + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/crates/Cargo.lock b/crates/Cargo.lock index e07b47ee..56fc260c 100644 --- a/crates/Cargo.lock +++ b/crates/Cargo.lock @@ -23,6 +23,18 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -257,6 +269,24 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.2", + "shlex", + "syn 2.0.117", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -316,6 +346,9 @@ dependencies = [ "hyper 1.9.0", "hyper-util", "lru", + "metrics 0.23.1", + "metrics-exporter-prometheus", + "metrics-process", "mockito", "opentelemetry", "opentelemetry-http", @@ -391,6 +424,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -428,6 +470,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "cmov" version = "0.5.3" @@ -574,6 +627,21 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -1070,6 +1138,12 @@ dependencies = [ "wasip3", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "governor" version = "0.6.3" @@ -1128,7 +1202,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25" dependencies = [ - "ahash", + "ahash 0.3.8", "autocfg", ] @@ -1138,6 +1212,15 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash 0.8.12", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1189,6 +1272,12 @@ dependencies = [ "uuid", ] +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1665,6 +1754,27 @@ version = "0.2.185" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libproc" +version = "0.14.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a54ad7278b8bc5301d5ffd2a94251c004feb971feba96c971ea4063645990757" +dependencies = [ + "bindgen", + "errno", + "libc", +] + [[package]] name = "libredox" version = "0.1.16" @@ -1745,6 +1855,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "mach2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae608c151f68243f2b000364e1f7b186d9c29845f7d2d85bd31b9ad77ad552b" + [[package]] name = "matchers" version = "0.2.0" @@ -1782,6 +1898,77 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "metrics" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3045b4193fbdc5b5681f32f11070da9be3609f189a79f3390706d42587f46bb5" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +dependencies = [ + "ahash 0.8.12", + "portable-atomic", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" +dependencies = [ + "base64 0.22.1", + "http-body-util", + "hyper 1.9.0", + "hyper-util", + "indexmap 2.14.0", + "ipnet", + "metrics 0.23.1", + "metrics-util", + "quanta", + "thiserror 1.0.69", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-process" +version = "2.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4268d87f64a752f5a651314fc683f04da10be65701ea3e721ba4d74f79163cac" +dependencies = [ + "libc", + "libproc", + "mach2", + "metrics 0.24.3", + "once_cell", + "procfs", + "rlimit", + "windows", +] + +[[package]] +name = "metrics-util" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4259040465c955f9f2f1a4a8a16dc46726169bca0f88e8fb2dbeced487c3e828" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.14.5", + "metrics 0.23.1", + "num_cpus", + "quanta", + "sketches-ddsketch", +] + [[package]] name = "mime" version = "0.3.17" @@ -1935,6 +2122,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "objc2-core-foundation" version = "0.3.2" @@ -2278,6 +2475,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25485360a54d6861439d60facef26de713b1e126bf015ec8f98239467a2b82f7" +dependencies = [ + "bitflags", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6401bf7b6af22f78b563665d15a22e9aef27775b79b149a66ca022468a4e405" +dependencies = [ + "bitflags", + "hex", +] + [[package]] name = "prompt_gateway" version = "0.1.0" @@ -2333,6 +2551,21 @@ dependencies = [ "log", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi 0.11.1+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.11.9" @@ -2485,6 +2718,15 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags", +] + [[package]] name = "redis" version = "0.27.6" @@ -2646,6 +2888,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rlimit" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f35ee2729c56bb610f6dba436bf78135f728b7373bdffae2ec815b2d3eb98cc3" +dependencies = [ + "libc", +] + [[package]] name = "rustc-hash" version = "1.1.0" @@ -3098,6 +3349,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" + [[package]] name = "slab" version = "0.4.12" @@ -4003,6 +4260,49 @@ dependencies = [ "web-sys", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -4016,6 +4316,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -4044,6 +4355,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + [[package]] name = "windows-registry" version = "0.6.1" @@ -4133,6 +4454,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" diff --git a/crates/brightstaff/Cargo.toml b/crates/brightstaff/Cargo.toml index f88ed918..b9718e44 100644 --- a/crates/brightstaff/Cargo.toml +++ b/crates/brightstaff/Cargo.toml @@ -27,6 +27,9 @@ opentelemetry_sdk = { version = "0.31", features = ["rt-tokio"] } pretty_assertions = "1.4.1" rand = "0.9.2" lru = "0.12" +metrics = "0.23" +metrics-exporter-prometheus = { version = "0.15", default-features = false, features = ["http-listener"] } +metrics-process = "2.1" redis = { version = "0.27", features = ["tokio-comp"] } reqwest = { version = "0.12.15", features = ["stream"] } serde = { version = "1.0.219", features = ["derive"] } diff --git a/crates/brightstaff/src/handlers/llm/mod.rs b/crates/brightstaff/src/handlers/llm/mod.rs index 719c048d..94930caa 100644 --- a/crates/brightstaff/src/handlers/llm/mod.rs +++ b/crates/brightstaff/src/handlers/llm/mod.rs @@ -24,13 +24,14 @@ use crate::app_state::AppState; use crate::handlers::agents::pipeline::PipelineProcessor; use crate::handlers::extract_request_id; use crate::handlers::full; +use crate::metrics as bs_metrics; use crate::state::response_state_processor::ResponsesStateProcessor; use crate::state::{ extract_input_items, retrieve_and_combine_input, StateStorage, StateStorageError, }; use crate::streaming::{ create_streaming_response, create_streaming_response_with_output_filter, truncate_message, - ObservableStreamProcessor, StreamProcessor, + LlmMetricsCtx, ObservableStreamProcessor, StreamProcessor, }; use crate::tracing::{ collect_custom_trace_attributes, llm as tracing_llm, operation_component, @@ -686,6 +687,13 @@ async fn send_upstream( let request_start_time = std::time::Instant::now(); + // Labels for LLM upstream metrics. We prefer `resolved_model` (post-routing) + // and derive the provider from its `provider/model` prefix. This matches the + // same model id the cost/latency router keys off. + let (metric_provider_raw, metric_model_raw) = bs_metrics::split_provider_model(resolved_model); + let metric_provider = metric_provider_raw.to_string(); + let metric_model = metric_model_raw.to_string(); + let llm_response = match http_client .post(upstream_url) .headers(request_headers.clone()) @@ -695,6 +703,14 @@ async fn send_upstream( { Ok(res) => res, Err(err) => { + let err_class = bs_metrics::llm_error_class_from_reqwest(&err); + bs_metrics::record_llm_upstream( + &metric_provider, + &metric_model, + 0, + err_class, + request_start_time.elapsed(), + ); let err_msg = format!("Failed to send request: {}", err); let mut internal_error = Response::new(full(err_msg)); *internal_error.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; @@ -750,7 +766,12 @@ async fn send_upstream( span_name, request_start_time, messages_for_signals, - ); + ) + .with_llm_metrics(LlmMetricsCtx { + provider: metric_provider.clone(), + model: metric_model.clone(), + upstream_status: upstream_status.as_u16(), + }); let output_filter_request_headers = if filter_pipeline.has_output_filters() { Some(request_headers.clone()) diff --git a/crates/brightstaff/src/handlers/llm/model_selection.rs b/crates/brightstaff/src/handlers/llm/model_selection.rs index 1b4315e7..a1378d86 100644 --- a/crates/brightstaff/src/handlers/llm/model_selection.rs +++ b/crates/brightstaff/src/handlers/llm/model_selection.rs @@ -5,10 +5,24 @@ use hyper::StatusCode; use std::sync::Arc; use tracing::{debug, info, warn}; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator::OrchestratorService; use crate::streaming::truncate_message; use crate::tracing::routing; +/// Classify a request path (already stripped of `/agents` or `/routing` by +/// the caller) into the fixed `route` label used on routing metrics. +fn route_label_for_path(request_path: &str) -> &'static str { + if request_path.starts_with("/agents") { + metric_labels::ROUTE_AGENT + } else if request_path.starts_with("/routing") { + metric_labels::ROUTE_ROUTING + } else { + metric_labels::ROUTE_LLM + } +} + pub struct RoutingResult { /// Primary model to use (first in the ranked list). pub model_name: String, @@ -106,15 +120,23 @@ pub async fn router_chat_get_upstream_model( ) .await; - let determination_ms = routing_start_time.elapsed().as_millis() as i64; + let determination_elapsed = routing_start_time.elapsed(); + let determination_ms = determination_elapsed.as_millis() as i64; let current_span = tracing::Span::current(); current_span.record(routing::ROUTE_DETERMINATION_MS, determination_ms); + let route_label = route_label_for_path(request_path); match routing_result { Ok(route) => match route { Some((route_name, ranked_models)) => { let model_name = ranked_models.first().cloned().unwrap_or_default(); current_span.record("route.selected_model", model_name.as_str()); + bs_metrics::record_router_decision( + route_label, + &model_name, + false, + determination_elapsed, + ); Ok(RoutingResult { model_name, models: ranked_models, @@ -126,6 +148,12 @@ pub async fn router_chat_get_upstream_model( // This signals to llm.rs to use the original validated request model current_span.record("route.selected_model", "none"); info!("no route determined, using default model"); + bs_metrics::record_router_decision( + route_label, + "none", + true, + determination_elapsed, + ); Ok(RoutingResult { model_name: "none".to_string(), @@ -136,6 +164,7 @@ pub async fn router_chat_get_upstream_model( }, Err(err) => { current_span.record("route.selected_model", "unknown"); + bs_metrics::record_router_decision(route_label, "unknown", true, determination_elapsed); Err(RoutingError::internal_error(format!( "Failed to determine route: {}", err diff --git a/crates/brightstaff/src/handlers/routing_service.rs b/crates/brightstaff/src/handlers/routing_service.rs index 5fc0d3b9..b93b1422 100644 --- a/crates/brightstaff/src/handlers/routing_service.rs +++ b/crates/brightstaff/src/handlers/routing_service.rs @@ -12,6 +12,8 @@ use tracing::{debug, info, info_span, warn, Instrument}; use super::extract_or_generate_traceparent; use crate::handlers::llm::model_selection::router_chat_get_upstream_model; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator::OrchestratorService; use crate::tracing::{collect_custom_trace_attributes, operation_component, set_service_name}; @@ -230,6 +232,17 @@ async fn routing_decision_inner( pinned: false, }; + // Distinguish "decision served" (a concrete model picked) from + // "no_candidates" (the sentinel "none" returned when nothing + // matched). The handler still responds 200 in both cases, so RED + // metrics alone can't tell them apart. + let outcome = if response.models.first().map(|m| m == "none").unwrap_or(true) { + metric_labels::ROUTING_SVC_NO_CANDIDATES + } else { + metric_labels::ROUTING_SVC_DECISION_SERVED + }; + bs_metrics::record_routing_service_outcome(outcome); + info!( primary_model = %response.models.first().map(|s| s.as_str()).unwrap_or("none"), total_models = response.models.len(), @@ -249,6 +262,7 @@ async fn routing_decision_inner( .unwrap()) } Err(err) => { + bs_metrics::record_routing_service_outcome(metric_labels::ROUTING_SVC_POLICY_ERROR); warn!(error = %err.message, "routing decision failed"); Ok(BrightStaffError::InternalServerError(err.message).into_response()) } diff --git a/crates/brightstaff/src/lib.rs b/crates/brightstaff/src/lib.rs index a0ba5f43..66c6eadf 100644 --- a/crates/brightstaff/src/lib.rs +++ b/crates/brightstaff/src/lib.rs @@ -1,5 +1,6 @@ pub mod app_state; pub mod handlers; +pub mod metrics; pub mod router; pub mod session_cache; pub mod signals; diff --git a/crates/brightstaff/src/main.rs b/crates/brightstaff/src/main.rs index 40ac429d..80e03b4b 100644 --- a/crates/brightstaff/src/main.rs +++ b/crates/brightstaff/src/main.rs @@ -5,6 +5,8 @@ use brightstaff::handlers::function_calling::function_calling_chat_handler; use brightstaff::handlers::llm::llm_chat; use brightstaff::handlers::models::list_models; use brightstaff::handlers::routing_service::routing_decision; +use brightstaff::metrics as bs_metrics; +use brightstaff::metrics::labels as metric_labels; use brightstaff::router::model_metrics::ModelMetricsService; use brightstaff::router::orchestrator::OrchestratorService; use brightstaff::session_cache::init_session_cache; @@ -384,10 +386,79 @@ async fn init_state_storage( // Request routing // --------------------------------------------------------------------------- +/// Normalized method label — limited set so we never emit a free-form string. +fn method_label(method: &Method) -> &'static str { + match *method { + Method::GET => "GET", + Method::POST => "POST", + Method::PUT => "PUT", + Method::DELETE => "DELETE", + Method::PATCH => "PATCH", + Method::HEAD => "HEAD", + Method::OPTIONS => "OPTIONS", + _ => "OTHER", + } +} + +/// Compute the fixed `handler` metric label from the request's path+method. +/// Returning `None` for fall-through means `route()` will hand the request to +/// the catch-all 404 branch. +fn handler_label_for(method: &Method, path: &str) -> &'static str { + if let Some(stripped) = path.strip_prefix("/agents") { + if matches!( + stripped, + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return metric_labels::HANDLER_AGENT_CHAT; + } + } + if let Some(stripped) = path.strip_prefix("/routing") { + if matches!( + stripped, + CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH + ) { + return metric_labels::HANDLER_ROUTING_DECISION; + } + } + match (method, path) { + (&Method::POST, CHAT_COMPLETIONS_PATH | MESSAGES_PATH | OPENAI_RESPONSES_API_PATH) => { + metric_labels::HANDLER_LLM_CHAT + } + (&Method::POST, "/function_calling") => metric_labels::HANDLER_FUNCTION_CALLING, + (&Method::GET, "/v1/models" | "/agents/v1/models") => metric_labels::HANDLER_LIST_MODELS, + (&Method::OPTIONS, "/v1/models" | "/agents/v1/models") => { + metric_labels::HANDLER_CORS_PREFLIGHT + } + _ => metric_labels::HANDLER_NOT_FOUND, + } +} + /// Route an incoming HTTP request to the appropriate handler. async fn route( req: Request, state: Arc, +) -> Result>, hyper::Error> { + let handler = handler_label_for(req.method(), req.uri().path()); + let method = method_label(req.method()); + let started = std::time::Instant::now(); + let _in_flight = bs_metrics::InFlightGuard::new(handler); + + let result = dispatch(req, state).await; + + let status = match &result { + Ok(resp) => resp.status().as_u16(), + // hyper::Error here means the body couldn't be produced; conventionally 500. + Err(_) => 500, + }; + bs_metrics::record_http(handler, method, status, started); + result +} + +/// Inner dispatcher split out so `route()` can wrap it with metrics without +/// duplicating the match tree. +async fn dispatch( + req: Request, + state: Arc, ) -> Result>, hyper::Error> { let parent_cx = global::get_text_map_propagator(|p| p.extract(&HeaderExtractor(req.headers()))); let path = req.uri().path().to_string(); @@ -503,6 +574,7 @@ async fn run_server(state: Arc) -> Result<(), Box Result<(), Box> { let config = load_config()?; let _tracer_provider = init_tracer(config.tracing.as_ref()); + bs_metrics::init(); info!("loaded plano_config.yaml"); let state = Arc::new(init_app_state(&config).await?); run_server(state).await diff --git a/crates/brightstaff/src/metrics/labels.rs b/crates/brightstaff/src/metrics/labels.rs new file mode 100644 index 00000000..4eaf3e59 --- /dev/null +++ b/crates/brightstaff/src/metrics/labels.rs @@ -0,0 +1,38 @@ +//! Fixed label-value constants so callers never emit free-form strings +//! (which would blow up cardinality). + +// Handler enum — derived from the path+method match in `route()`. +pub const HANDLER_AGENT_CHAT: &str = "agent_chat"; +pub const HANDLER_ROUTING_DECISION: &str = "routing_decision"; +pub const HANDLER_LLM_CHAT: &str = "llm_chat"; +pub const HANDLER_FUNCTION_CALLING: &str = "function_calling"; +pub const HANDLER_LIST_MODELS: &str = "list_models"; +pub const HANDLER_CORS_PREFLIGHT: &str = "cors_preflight"; +pub const HANDLER_NOT_FOUND: &str = "not_found"; + +// Router "route" class — which brightstaff endpoint prompted the decision. +pub const ROUTE_AGENT: &str = "agent"; +pub const ROUTE_ROUTING: &str = "routing"; +pub const ROUTE_LLM: &str = "llm"; + +// Token kind for brightstaff_llm_tokens_total. +pub const TOKEN_KIND_PROMPT: &str = "prompt"; +pub const TOKEN_KIND_COMPLETION: &str = "completion"; + +// LLM error_class values (match docstring in metrics/mod.rs). +pub const LLM_ERR_NONE: &str = "none"; +pub const LLM_ERR_TIMEOUT: &str = "timeout"; +pub const LLM_ERR_CONNECT: &str = "connect"; +pub const LLM_ERR_PARSE: &str = "parse"; +pub const LLM_ERR_OTHER: &str = "other"; +pub const LLM_ERR_STREAM: &str = "stream"; + +// Routing service outcome values. +pub const ROUTING_SVC_DECISION_SERVED: &str = "decision_served"; +pub const ROUTING_SVC_NO_CANDIDATES: &str = "no_candidates"; +pub const ROUTING_SVC_POLICY_ERROR: &str = "policy_error"; + +// Session cache outcome values. +pub const SESSION_CACHE_HIT: &str = "hit"; +pub const SESSION_CACHE_MISS: &str = "miss"; +pub const SESSION_CACHE_STORE: &str = "store"; diff --git a/crates/brightstaff/src/metrics/mod.rs b/crates/brightstaff/src/metrics/mod.rs new file mode 100644 index 00000000..34679cca --- /dev/null +++ b/crates/brightstaff/src/metrics/mod.rs @@ -0,0 +1,377 @@ +//! Prometheus metrics for brightstaff. +//! +//! Installs the `metrics` global recorder backed by +//! `metrics-exporter-prometheus` and exposes a `/metrics` HTTP endpoint on a +//! dedicated admin port (default `0.0.0.0:9092`, overridable via +//! `METRICS_BIND_ADDRESS`). +//! +//! Emitted metric families (see `describe_all` for full list): +//! - HTTP RED: `brightstaff_http_requests_total`, +//! `brightstaff_http_request_duration_seconds`, +//! `brightstaff_http_in_flight_requests`. +//! - LLM upstream: `brightstaff_llm_upstream_requests_total`, +//! `brightstaff_llm_upstream_duration_seconds`, +//! `brightstaff_llm_time_to_first_token_seconds`, +//! `brightstaff_llm_tokens_total`, +//! `brightstaff_llm_tokens_usage_missing_total`. +//! - Routing: `brightstaff_router_decisions_total`, +//! `brightstaff_router_decision_duration_seconds`, +//! `brightstaff_routing_service_requests_total`, +//! `brightstaff_session_cache_events_total`. +//! - Process: via `metrics-process`. +//! - Build: `brightstaff_build_info`. + +use std::net::SocketAddr; +use std::sync::OnceLock; +use std::time::{Duration, Instant}; + +use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram}; +use metrics_exporter_prometheus::{Matcher, PrometheusBuilder}; +use tracing::{info, warn}; + +pub mod labels; + +/// Guard flag so tests don't re-install the global recorder. +static INIT: OnceLock<()> = OnceLock::new(); + +const DEFAULT_METRICS_BIND: &str = "0.0.0.0:9092"; + +/// HTTP request duration buckets (seconds). Capped at 60s. +const HTTP_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, +]; + +/// LLM upstream / TTFT buckets (seconds). Capped at 120s because provider +/// completions routinely run that long. +const LLM_BUCKETS: &[f64] = &[0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]; + +/// Router decision buckets (seconds). The orchestrator call itself is usually +/// sub-second but bucketed generously in case of upstream slowness. +const ROUTER_BUCKETS: &[f64] = &[ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, +]; + +/// Install the global recorder and spawn the `/metrics` HTTP listener. +/// +/// Safe to call more than once; subsequent calls are no-ops so tests that +/// construct their own recorder still work. +pub fn init() { + if INIT.get().is_some() { + return; + } + + let bind: SocketAddr = std::env::var("METRICS_BIND_ADDRESS") + .unwrap_or_else(|_| DEFAULT_METRICS_BIND.to_string()) + .parse() + .unwrap_or_else(|err| { + warn!(error = %err, default = DEFAULT_METRICS_BIND, "invalid METRICS_BIND_ADDRESS, falling back to default"); + DEFAULT_METRICS_BIND.parse().expect("default bind parses") + }); + + let builder = PrometheusBuilder::new() + .with_http_listener(bind) + .set_buckets_for_metric( + Matcher::Full("brightstaff_http_request_duration_seconds".to_string()), + HTTP_BUCKETS, + ) + .and_then(|b| { + b.set_buckets_for_metric(Matcher::Prefix("brightstaff_llm_".to_string()), LLM_BUCKETS) + }) + .and_then(|b| { + b.set_buckets_for_metric( + Matcher::Full("brightstaff_router_decision_duration_seconds".to_string()), + ROUTER_BUCKETS, + ) + }); + + let builder = match builder { + Ok(b) => b, + Err(err) => { + warn!(error = %err, "failed to configure metrics buckets, using defaults"); + PrometheusBuilder::new().with_http_listener(bind) + } + }; + + if let Err(err) = builder.install() { + warn!(error = %err, "failed to install Prometheus recorder; metrics disabled"); + return; + } + + let _ = INIT.set(()); + + describe_all(); + emit_build_info(); + + // Register process-level collector (RSS, CPU, FDs). + let collector = metrics_process::Collector::default(); + collector.describe(); + // Prime once at startup; subsequent scrapes refresh via the exporter's + // per-scrape render, so we additionally refresh on a short interval to + // keep gauges moving between scrapes without requiring client pull. + collector.collect(); + tokio::spawn(async move { + let mut tick = tokio::time::interval(Duration::from_secs(10)); + tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + tick.tick().await; + collector.collect(); + } + }); + + info!(address = %bind, "metrics listener started"); +} + +fn describe_all() { + describe_counter!( + "brightstaff_http_requests_total", + "Total HTTP requests served by brightstaff, by handler and status class." + ); + describe_histogram!( + "brightstaff_http_request_duration_seconds", + "Wall-clock duration of HTTP requests served by brightstaff, by handler." + ); + describe_gauge!( + "brightstaff_http_in_flight_requests", + "Number of HTTP requests currently being served by brightstaff, by handler." + ); + + describe_counter!( + "brightstaff_llm_upstream_requests_total", + "LLM upstream request outcomes, by provider, model, status class and error class." + ); + describe_histogram!( + "brightstaff_llm_upstream_duration_seconds", + "Wall-clock duration of LLM upstream calls (stream close for streaming), by provider and model." + ); + describe_histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "Time from request start to first streamed byte, by provider and model (streaming only)." + ); + describe_counter!( + "brightstaff_llm_tokens_total", + "Tokens reported in the provider `usage` field, by provider, model and kind (prompt/completion)." + ); + describe_counter!( + "brightstaff_llm_tokens_usage_missing_total", + "LLM responses that completed without a usable `usage` block (so token counts are unknown)." + ); + + describe_counter!( + "brightstaff_router_decisions_total", + "Routing decisions made by the orchestrator, by route, selected model, and whether a fallback was used." + ); + describe_histogram!( + "brightstaff_router_decision_duration_seconds", + "Time spent in the orchestrator deciding a route, by route." + ); + describe_counter!( + "brightstaff_routing_service_requests_total", + "Outcomes of /routing/* decision requests: decision_served, no_candidates, policy_error." + ); + describe_counter!( + "brightstaff_session_cache_events_total", + "Session affinity cache lookups and stores, by outcome." + ); + + describe_gauge!( + "brightstaff_build_info", + "Build metadata. Always 1; labels carry version and git SHA." + ); +} + +fn emit_build_info() { + let version = env!("CARGO_PKG_VERSION"); + let git_sha = option_env!("GIT_SHA").unwrap_or("unknown"); + gauge!( + "brightstaff_build_info", + "version" => version.to_string(), + "git_sha" => git_sha.to_string(), + ) + .set(1.0); +} + +/// Split a provider-qualified model id like `"openai/gpt-4o"` into +/// `(provider, model)`. Returns `("unknown", raw)` when there is no `/`. +pub fn split_provider_model(full: &str) -> (&str, &str) { + match full.split_once('/') { + Some((p, m)) => (p, m), + None => ("unknown", full), + } +} + +/// Bucket an HTTP status code into `"2xx"` / `"4xx"` / `"5xx"` / `"1xx"` / `"3xx"`. +pub fn status_class(status: u16) -> &'static str { + match status { + 100..=199 => "1xx", + 200..=299 => "2xx", + 300..=399 => "3xx", + 400..=499 => "4xx", + 500..=599 => "5xx", + _ => "other", + } +} + +// --------------------------------------------------------------------------- +// HTTP RED helpers +// --------------------------------------------------------------------------- + +/// RAII guard that increments the in-flight gauge on construction and +/// decrements on drop. Pair with [`HttpTimer`] in the `route()` wrapper so the +/// gauge drops even on error paths. +pub struct InFlightGuard { + handler: &'static str, +} + +impl InFlightGuard { + pub fn new(handler: &'static str) -> Self { + gauge!( + "brightstaff_http_in_flight_requests", + "handler" => handler, + ) + .increment(1.0); + Self { handler } + } +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + gauge!( + "brightstaff_http_in_flight_requests", + "handler" => self.handler, + ) + .decrement(1.0); + } +} + +/// Record the HTTP request counter + duration histogram. +pub fn record_http(handler: &'static str, method: &'static str, status: u16, started: Instant) { + let class = status_class(status); + counter!( + "brightstaff_http_requests_total", + "handler" => handler, + "method" => method, + "status_class" => class, + ) + .increment(1); + histogram!( + "brightstaff_http_request_duration_seconds", + "handler" => handler, + ) + .record(started.elapsed().as_secs_f64()); +} + +// --------------------------------------------------------------------------- +// LLM upstream helpers +// --------------------------------------------------------------------------- + +/// Classify an outcome of an LLM upstream call for the `error_class` label. +pub fn llm_error_class_from_reqwest(err: &reqwest::Error) -> &'static str { + if err.is_timeout() { + "timeout" + } else if err.is_connect() { + "connect" + } else if err.is_decode() { + "parse" + } else { + "other" + } +} + +/// Record the outcome of an LLM upstream call. `status` is the HTTP status +/// the upstream returned (0 if the call never produced one, e.g. send failure). +/// `error_class` is `"none"` on success, or a discriminated error label. +pub fn record_llm_upstream( + provider: &str, + model: &str, + status: u16, + error_class: &str, + duration: Duration, +) { + let class = if status == 0 { + "error" + } else { + status_class(status) + }; + counter!( + "brightstaff_llm_upstream_requests_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "status_class" => class, + "error_class" => error_class.to_string(), + ) + .increment(1); + histogram!( + "brightstaff_llm_upstream_duration_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(duration.as_secs_f64()); +} + +pub fn record_llm_ttft(provider: &str, model: &str, ttft: Duration) { + histogram!( + "brightstaff_llm_time_to_first_token_seconds", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .record(ttft.as_secs_f64()); +} + +pub fn record_llm_tokens(provider: &str, model: &str, kind: &'static str, count: u64) { + counter!( + "brightstaff_llm_tokens_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + "kind" => kind, + ) + .increment(count); +} + +pub fn record_llm_tokens_usage_missing(provider: &str, model: &str) { + counter!( + "brightstaff_llm_tokens_usage_missing_total", + "provider" => provider.to_string(), + "model" => model.to_string(), + ) + .increment(1); +} + +// --------------------------------------------------------------------------- +// Router helpers +// --------------------------------------------------------------------------- + +pub fn record_router_decision( + route: &'static str, + selected_model: &str, + fallback: bool, + duration: Duration, +) { + counter!( + "brightstaff_router_decisions_total", + "route" => route, + "selected_model" => selected_model.to_string(), + "fallback" => if fallback { "true" } else { "false" }, + ) + .increment(1); + histogram!( + "brightstaff_router_decision_duration_seconds", + "route" => route, + ) + .record(duration.as_secs_f64()); +} + +pub fn record_routing_service_outcome(outcome: &'static str) { + counter!( + "brightstaff_routing_service_requests_total", + "outcome" => outcome, + ) + .increment(1); +} + +pub fn record_session_cache_event(outcome: &'static str) { + counter!( + "brightstaff_session_cache_events_total", + "outcome" => outcome, + ) + .increment(1); +} diff --git a/crates/brightstaff/src/router/orchestrator.rs b/crates/brightstaff/src/router/orchestrator.rs index 7aaf70a2..2d7b25de 100644 --- a/crates/brightstaff/src/router/orchestrator.rs +++ b/crates/brightstaff/src/router/orchestrator.rs @@ -15,6 +15,8 @@ use super::http::{self, post_and_extract_content}; use super::model_metrics::ModelMetricsService; use super::orchestrator_model::OrchestratorModel; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::router::orchestrator_model_v1; use crate::session_cache::SessionCache; @@ -130,7 +132,13 @@ impl OrchestratorService { tenant_id: Option<&str>, ) -> Option { let cache = self.session_cache.as_ref()?; - cache.get(&Self::session_key(tenant_id, session_id)).await + let result = cache.get(&Self::session_key(tenant_id, session_id)).await; + bs_metrics::record_session_cache_event(if result.is_some() { + metric_labels::SESSION_CACHE_HIT + } else { + metric_labels::SESSION_CACHE_MISS + }); + result } pub async fn cache_route( @@ -151,6 +159,7 @@ impl OrchestratorService { self.session_ttl, ) .await; + bs_metrics::record_session_cache_event(metric_labels::SESSION_CACHE_STORE); } } diff --git a/crates/brightstaff/src/streaming.rs b/crates/brightstaff/src/streaming.rs index 40cbbe7c..8a0f414b 100644 --- a/crates/brightstaff/src/streaming.rs +++ b/crates/brightstaff/src/streaming.rs @@ -20,6 +20,8 @@ const STREAM_BUFFER_SIZE: usize = 16; /// Most chat responses are well under this; pathological ones are dropped without /// affecting pass-through streaming to the client. const USAGE_BUFFER_MAX: usize = 2 * 1024 * 1024; +use crate::metrics as bs_metrics; +use crate::metrics::labels as metric_labels; use crate::signals::{InteractionQuality, SignalAnalyzer, TextBasedSignalAnalyzer, FLAG_MARKER}; use crate::tracing::{llm, set_service_name, signals as signal_constants}; use hermesllm::apis::openai::Message; @@ -172,6 +174,18 @@ impl StreamProcessor for Box { } } +/// Optional Prometheus-metric context for an LLM upstream call. When present, +/// [`ObservableStreamProcessor`] emits `brightstaff_llm_*` metrics at +/// first-byte / complete / error callbacks. +#[derive(Debug, Clone)] +pub struct LlmMetricsCtx { + pub provider: String, + pub model: String, + /// HTTP status of the upstream response. Used to pick `status_class` and + /// `error_class` on `on_complete`. + pub upstream_status: u16, +} + /// A processor that tracks streaming metrics pub struct ObservableStreamProcessor { service_name: String, @@ -185,6 +199,8 @@ pub struct ObservableStreamProcessor { /// on `on_complete`. Capped at `USAGE_BUFFER_MAX`; excess chunks are dropped /// from the buffer (they still pass through to the client). response_buffer: Vec, + llm_metrics: Option, + metrics_recorded: bool, } impl ObservableStreamProcessor { @@ -219,8 +235,17 @@ impl ObservableStreamProcessor { time_to_first_token: None, messages, response_buffer: Vec::new(), + llm_metrics: None, + metrics_recorded: false, } } + + /// Attach LLM upstream metric context so the processor emits + /// `brightstaff_llm_*` metrics on first-byte / complete / error. + pub fn with_llm_metrics(mut self, ctx: LlmMetricsCtx) -> Self { + self.llm_metrics = Some(ctx); + self + } } impl StreamProcessor for ObservableStreamProcessor { @@ -240,7 +265,11 @@ impl StreamProcessor for ObservableStreamProcessor { fn on_first_bytes(&mut self) { // Record time to first token (only for streaming) if self.time_to_first_token.is_none() { - self.time_to_first_token = Some(self.start_time.elapsed().as_millis()); + let elapsed = self.start_time.elapsed(); + self.time_to_first_token = Some(elapsed.as_millis()); + if let Some(ref ctx) = self.llm_metrics { + bs_metrics::record_llm_ttft(&ctx.provider, &ctx.model, elapsed); + } } } @@ -299,6 +328,39 @@ impl StreamProcessor for ObservableStreamProcessor { otel_span.set_attribute(KeyValue::new(llm::MODEL_NAME, resolved)); } } + + // Emit LLM upstream prometheus metrics (duration + tokens) if wired. + // The upstream responded (we have a status), so status_class alone + // carries the non-2xx signal — error_class stays "none". + if let Some(ref ctx) = self.llm_metrics { + bs_metrics::record_llm_upstream( + &ctx.provider, + &ctx.model, + ctx.upstream_status, + metric_labels::LLM_ERR_NONE, + self.start_time.elapsed(), + ); + if let Some(v) = usage.prompt_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_PROMPT, + v.max(0) as u64, + ); + } + if let Some(v) = usage.completion_tokens { + bs_metrics::record_llm_tokens( + &ctx.provider, + &ctx.model, + metric_labels::TOKEN_KIND_COMPLETION, + v.max(0) as u64, + ); + } + if usage.prompt_tokens.is_none() && usage.completion_tokens.is_none() { + bs_metrics::record_llm_tokens_usage_missing(&ctx.provider, &ctx.model); + } + self.metrics_recorded = true; + } // Release the buffered bytes early; nothing downstream needs them. self.response_buffer.clear(); self.response_buffer.shrink_to_fit(); @@ -396,6 +458,18 @@ impl StreamProcessor for ObservableStreamProcessor { duration_ms = self.start_time.elapsed().as_millis(), "stream error" ); + if let Some(ref ctx) = self.llm_metrics { + if !self.metrics_recorded { + bs_metrics::record_llm_upstream( + &ctx.provider, + &ctx.model, + ctx.upstream_status, + metric_labels::LLM_ERR_STREAM, + self.start_time.elapsed(), + ); + self.metrics_recorded = true; + } + } } } diff --git a/docs/source/guides/observability/monitoring.rst b/docs/source/guides/observability/monitoring.rst index 736e0a64..d28d25ca 100644 --- a/docs/source/guides/observability/monitoring.rst +++ b/docs/source/guides/observability/monitoring.rst @@ -75,3 +75,54 @@ are some sample configuration files for both, respectively. isDefault: true access: proxy editable: true + +Brightstaff metrics +~~~~~~~~~~~~~~~~~~~ + +In addition to Envoy's stats on ``:9901``, the brightstaff dataplane +process exposes its own Prometheus endpoint on ``0.0.0.0:9092`` (override +with ``METRICS_BIND_ADDRESS``). It publishes: + +* HTTP RED — ``brightstaff_http_requests_total``, + ``brightstaff_http_request_duration_seconds``, + ``brightstaff_http_in_flight_requests`` (labels: ``handler``, ``method``, + ``status_class``). +* LLM upstream — ``brightstaff_llm_upstream_requests_total``, + ``brightstaff_llm_upstream_duration_seconds``, + ``brightstaff_llm_time_to_first_token_seconds``, + ``brightstaff_llm_tokens_total`` (labels: ``provider``, ``model``, + ``error_class``, ``kind``). +* Routing — ``brightstaff_router_decisions_total``, + ``brightstaff_router_decision_duration_seconds``, + ``brightstaff_routing_service_requests_total``, + ``brightstaff_session_cache_events_total``. +* Process & build — ``process_resident_memory_bytes``, + ``process_cpu_seconds_total``, ``brightstaff_build_info``. + +A self-contained Prometheus + Grafana stack is shipped under +``config/grafana/``. With Plano already running on the host, bring it up +with one command: + +.. code-block:: bash + + cd config/grafana + docker compose up -d + open http://localhost:3000 # admin / admin (anonymous viewer also enabled) + +Grafana auto-loads the Prometheus datasource and the brightstaff +dashboard (look under the *Plano* folder). Prometheus scrapes the host's +``:9092`` and ``:9901`` via ``host.docker.internal``. + +Files: + +* ``config/grafana/docker-compose.yaml`` — one-command Prom + Grafana + stack with provisioning. +* ``config/grafana/prometheus_scrape.yaml`` — complete Prometheus config + with ``envoy`` and ``brightstaff`` scrape jobs (mounted by the + compose). +* ``config/grafana/brightstaff_dashboard.json`` — 19-panel dashboard + across HTTP RED, LLM upstream, Routing service, and Process & Envoy + link rows. Auto-provisioned by the compose; can also be imported by + hand via *Dashboards → New → Import*. +* ``config/grafana/provisioning/`` — Grafana provisioning files for the + datasource and dashboard provider.