mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 09:29:38 +02:00
init
This commit is contained in:
parent
c386f68743
commit
b6536eca38
100 changed files with 17680 additions and 377 deletions
10
ts/deploy/.env.example
Normal file
10
ts/deploy/.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# LLM API Keys
|
||||
OPENAI_TOKEN=
|
||||
CLAUDE_KEY=
|
||||
|
||||
# Gateway
|
||||
GATEWAY_SECRET=
|
||||
GATEWAY_PORT=8088
|
||||
|
||||
# Grafana
|
||||
# Grafana admin password; docker-compose.yml falls back to "admin" when this is unset, so change it on any shared host
GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
52
ts/deploy/docker-compose.dev.yml
Normal file
52
ts/deploy/docker-compose.dev.yml
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# TrustGraph TypeScript — Dev Overrides
|
||||
# Usage: docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d
|
||||
|
||||
services:
|
||||
# Live-edit dashboards: mounts ./grafana/dashboards read-write instead of the base file's :ro mount
|
||||
grafana:
|
||||
volumes:
|
||||
- ./grafana/provisioning/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
|
||||
- ./grafana/provisioning/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml:ro
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
# Dev only: anonymous visitors get the Admin role here (the base docker-compose.yml grants Viewer)
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
|
||||
- GF_AUTH_DISABLE_LOGIN_FORM=true
|
||||
- GF_USERS_DEFAULT_THEME=dark
|
||||
- GF_EXPLORE_ENABLED=true
|
||||
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor tempoSearch tempoServiceGraph
|
||||
|
||||
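# Prometheus config mounted read-write for editing; Prometheus re-reads it only on SIGHUP or restart (no --web.enable-lifecycle flag is set)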
|
||||
prometheus:
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
|
||||
# Loki config mounted read-write for editing; restart the loki service to apply changes
|
||||
loki:
|
||||
volumes:
|
||||
- ./loki/loki-config.yml:/etc/loki/local-config.yaml
|
||||
- loki-data:/tmp/loki
|
||||
|
||||
# NATS CLI tools for debugging
|
||||
nats-cli:
|
||||
image: natsio/nats-box:latest
|
||||
networks:
|
||||
- trustgraph
|
||||
environment:
|
||||
- NATS_URL=nats://nats:4222
|
||||
entrypoint: ["/bin/sh", "-c", "echo 'NATS Box ready. Use: docker compose exec nats-cli nats ...' && sleep infinity"]
|
||||
depends_on:
|
||||
nats:
|
||||
condition: service_healthy
|
||||
profiles:
|
||||
- debug
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
loki-data:
|
||||
|
||||
networks:
|
||||
trustgraph:
|
||||
driver: bridge
|
||||
276
ts/deploy/docker-compose.yml
Normal file
276
ts/deploy/docker-compose.yml
Normal file
|
|
@ -0,0 +1,276 @@
|
|||
# TrustGraph TypeScript — Full Stack
|
||||
# Usage: docker compose up -d
|
||||
# Observability UI: http://localhost:3000 (Grafana)
|
||||
|
||||
networks:
|
||||
trustgraph:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
nats-data:
|
||||
falkordb-data:
|
||||
qdrant-data:
|
||||
ollama-models:
|
||||
prometheus-data:
|
||||
loki-data:
|
||||
tempo-data:
|
||||
grafana-data:
|
||||
|
||||
services:
|
||||
# ---------------------------------------------------------------------------
|
||||
# Infrastructure
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
nats:
|
||||
image: nats:2.10-alpine
|
||||
command: ["--jetstream", "--http_port", "8222", "--store_dir", "/data"]
|
||||
ports:
|
||||
- "4222:4222" # Client connections
|
||||
- "8222:8222" # Monitoring / metrics
|
||||
volumes:
|
||||
- nats-data:/data
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:8222/healthz"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
restart: unless-stopped
|
||||
|
||||
falkordb:
|
||||
image: falkordb/falkordb:latest
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- falkordb-data:/data
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
restart: unless-stopped
|
||||
|
||||
qdrant:
|
||||
image: qdrant/qdrant:latest
|
||||
ports:
|
||||
- "6333:6333" # REST API
|
||||
- "6334:6334" # gRPC
|
||||
volumes:
|
||||
- qdrant-data:/qdrant/storage
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
# The qdrant/qdrant image ships without wget or curl, so probe the REST port with bash's built-in /dev/tcp instead
test: ["CMD-SHELL", "bash -c ':> /dev/tcp/127.0.0.1/6333' || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
restart: unless-stopped
|
||||
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- "11434:11434"
|
||||
volumes:
|
||||
- ollama-models:/root/.ollama
|
||||
networks:
|
||||
- trustgraph
|
||||
restart: unless-stopped
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Observability
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time=7d"
|
||||
- "--web.enable-remote-write-receiver"
|
||||
- "--enable-feature=exemplar-storage"
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
restart: unless-stopped
|
||||
|
||||
loki:
|
||||
image: grafana/loki:3.0.0
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- ./loki/loki-config.yml:/etc/loki/local-config.yaml:ro
|
||||
- loki-data:/tmp/loki
|
||||
command: ["-config.file=/etc/loki/local-config.yaml"]
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 15s
|
||||
restart: unless-stopped
|
||||
|
||||
tempo:
|
||||
image: grafana/tempo:latest
|
||||
ports:
|
||||
- "3200:3200" # Tempo API
|
||||
volumes:
|
||||
- ./tempo/tempo-config.yml:/etc/tempo/config.yml:ro
|
||||
- tempo-data:/tmp/tempo
|
||||
command: ["-config.file=/etc/tempo/config.yml"]
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
# NOTE(review): this probe needs wget inside grafana/tempo; with the floating :latest tag, confirm the image still ships it — otel-collector and grafana wait on this check via service_healthy
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3200/ready"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 15s
|
||||
restart: unless-stopped
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:latest
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC (apps send traces/metrics here)
|
||||
- "4318:4318" # OTLP HTTP
|
||||
- "8889:8889" # Prometheus exporter (scraped by Prometheus)
|
||||
volumes:
|
||||
- ./otel-collector/config.yml:/etc/otelcol-contrib/config.yaml:ro
|
||||
depends_on:
|
||||
tempo:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- trustgraph
|
||||
restart: unless-stopped
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3000:3000"
|
||||
volumes:
|
||||
- ./grafana/provisioning/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
|
||||
- ./grafana/provisioning/dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml:ro
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
- grafana-data:/var/lib/grafana
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
||||
- GF_AUTH_DISABLE_LOGIN_FORM=false
|
||||
- GF_USERS_DEFAULT_THEME=dark
|
||||
- GF_EXPLORE_ENABLED=true
|
||||
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor tempoSearch tempoServiceGraph
|
||||
depends_on:
|
||||
prometheus:
|
||||
condition: service_healthy
|
||||
loki:
|
||||
condition: service_healthy
|
||||
tempo:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- trustgraph
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
restart: unless-stopped
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TrustGraph Services (placeholders — will be filled in later)
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# gateway:
|
||||
# build:
|
||||
# context: ../
|
||||
# dockerfile: packages/base/Dockerfile
|
||||
# target: gateway
|
||||
# ports:
|
||||
# - "${GATEWAY_PORT:-8088}:8000"
|
||||
# environment:
|
||||
# - NATS_URL=nats://nats:4222
|
||||
# - FALKORDB_URL=redis://falkordb:6379
|
||||
# - QDRANT_URL=http://qdrant:6333
|
||||
# - OPENAI_TOKEN=${OPENAI_TOKEN}
|
||||
# - CLAUDE_KEY=${CLAUDE_KEY}
|
||||
# - GATEWAY_SECRET=${GATEWAY_SECRET}
|
||||
# - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||
# - OTEL_SERVICE_NAME=gateway
|
||||
# depends_on:
|
||||
# nats:
|
||||
# condition: service_healthy
|
||||
# falkordb:
|
||||
# condition: service_healthy
|
||||
# qdrant:
|
||||
# condition: service_healthy
|
||||
# networks:
|
||||
# - trustgraph
|
||||
#
|
||||
# text-completion:
|
||||
# build:
|
||||
# context: ../
|
||||
# dockerfile: packages/base/Dockerfile
|
||||
# target: text-completion
|
||||
# environment:
|
||||
# - NATS_URL=nats://nats:4222
|
||||
# - OPENAI_TOKEN=${OPENAI_TOKEN}
|
||||
# - CLAUDE_KEY=${CLAUDE_KEY}
|
||||
# - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||
# - OTEL_SERVICE_NAME=text-completion
|
||||
# depends_on:
|
||||
# nats:
|
||||
# condition: service_healthy
|
||||
# networks:
|
||||
# - trustgraph
|
||||
#
|
||||
# graph-rag:
|
||||
# build:
|
||||
# context: ../
|
||||
# dockerfile: packages/base/Dockerfile
|
||||
# target: graph-rag
|
||||
# environment:
|
||||
# - NATS_URL=nats://nats:4222
|
||||
# - FALKORDB_URL=redis://falkordb:6379
|
||||
# - QDRANT_URL=http://qdrant:6333
|
||||
# - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||
# - OTEL_SERVICE_NAME=graph-rag
|
||||
# depends_on:
|
||||
# nats:
|
||||
# condition: service_healthy
|
||||
# falkordb:
|
||||
# condition: service_healthy
|
||||
# qdrant:
|
||||
# condition: service_healthy
|
||||
# networks:
|
||||
# - trustgraph
|
||||
#
|
||||
# workbench:
|
||||
# build:
|
||||
# context: ../
|
||||
# dockerfile: packages/workbench/Dockerfile
|
||||
# ports:
|
||||
# - "3001:3000"
|
||||
# environment:
|
||||
# - GATEWAY_URL=http://gateway:8000
|
||||
# depends_on:
|
||||
# - gateway
|
||||
# networks:
|
||||
# - trustgraph
|
||||
317
ts/deploy/grafana/dashboards/llm-metrics.json
Normal file
317
ts/deploy/grafana/dashboards/llm-metrics.json
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "LLM Request Latency by Provider",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\".*text-completion.*\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\".*text-completion.*\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\".*text-completion.*\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 5,
|
||||
"gradientMode": "scheme",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 30 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Token Usage (Input vs Output)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"id": 2,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(tg_llm_input_tokens_total[5m])) by (job)",
|
||||
"legendFormat": "{{job}} input tokens/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(tg_llm_output_tokens_total[5m])) by (job)",
|
||||
"legendFormat": "{{job}} output tokens/s",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "tokens/s",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 20,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "normal" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*input.*" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*output.*" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "sum"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Rate Limit Events",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"id": 3,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(tg_consumer_rate_limit_total[5m])) by (job)",
|
||||
"legendFormat": "{{job}} rate limits/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(tg_consumer_rate_limit_total[1h])) by (job)",
|
||||
"legendFormat": "{{job}} total (1h)",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 50,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*total.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.drawStyle", "value": "line" },
|
||||
{ "id": "custom.axisPlacement", "value": "right" },
|
||||
{ "id": "custom.fillOpacity", "value": 0 },
|
||||
{ "id": "custom.lineWidth", "value": 2 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["sum", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Streaming Chunk Latency",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"id": 4,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_llm_stream_chunk_duration_seconds_bucket[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} chunk p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_llm_stream_chunk_duration_seconds_bucket[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} chunk p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_llm_time_to_first_token_seconds_bucket[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} TTFT p50",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_llm_time_to_first_token_seconds_bucket[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} TTFT p95",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 5,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*TTFT.*" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineStyle", "value": { "fill": "dash", "dash": [10, 10] } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["trustgraph", "llm"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "TrustGraph - LLM Performance",
|
||||
"uid": "tg-llm-metrics",
|
||||
"version": 1
|
||||
}
|
||||
275
ts/deploy/grafana/dashboards/overview.json
Normal file
275
ts/deploy/grafana/dashboards/overview.json
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Service Health",
|
||||
"type": "stat",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "type": "value", "options": { "0": { "text": "DOWN", "color": "red" } } },
|
||||
{ "type": "value", "options": { "1": { "text": "UP", "color": "green" } } }
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"color": { "mode": "thresholds" }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"values": false,
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": ""
|
||||
},
|
||||
"orientation": "auto",
|
||||
"textMode": "auto",
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "NATS Message Throughput",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 2,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(tg_producer_items_total[5m])",
|
||||
"legendFormat": "{{job}} produced",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(tg_consumer_processing_total[5m])",
|
||||
"legendFormat": "{{job}} consumed ({{status}})",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "msg/s",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Request Latency (p50 / p95 / p99)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 3,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(tg_consumer_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 5,
|
||||
"gradientMode": "scheme",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 4,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(tg_consumer_processing_total{status=\"error\"}[5m])) by (job)",
|
||||
"legendFormat": "{{job}} errors/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(tg_consumer_processing_total{status=\"error\"}[5m])) / sum(rate(tg_consumer_processing_total[5m]))",
|
||||
"legendFormat": "overall error ratio",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 15,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line+area" }
|
||||
},
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "transparent", "value": null },
|
||||
{ "color": "red", "value": 0.05 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "overall error ratio" },
|
||||
"properties": [
|
||||
{ "id": "unit", "value": "percentunit" },
|
||||
{ "id": "custom.axisPlacement", "value": "right" },
|
||||
{ "id": "custom.drawStyle", "value": "line" },
|
||||
{ "id": "custom.lineWidth", "value": 3 },
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["trustgraph", "overview"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "TrustGraph - Service Health",
|
||||
"uid": "tg-overview",
|
||||
"version": 1
|
||||
}
|
||||
404
ts/deploy/grafana/dashboards/rag-pipeline.json
Normal file
404
ts/deploy/grafana/dashboards/rag-pipeline.json
Normal file
|
|
@ -0,0 +1,404 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "End-to-End RAG Query Latency",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"graph-rag|document-rag\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"graph-rag|document-rag\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"graph-rag|document-rag\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "scheme",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "line" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 15 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max", "last"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Concept Extraction Time",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"id": 2,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"kg-extract.*\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"kg-extract.*\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 3 },
|
||||
{ "color": "red", "value": 10 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Embedding Generation Time",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"id": 3,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"embeddings|document-embeddings|graph-embeddings\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"embeddings|document-embeddings|graph-embeddings\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Graph Traversal Time",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"id": 4,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"query-triples|query-graph-embeddings|query-doc-embeddings\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"query-triples|query-graph-embeddings|query-doc-embeddings\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.5 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Synthesis Time (Text Completion / RAG)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||
"id": 5,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"text-completion|text-completion-rag|prompt-rag\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(tg_consumer_request_duration_seconds_bucket{job=~\"text-completion|text-completion-rag|prompt-rag\"}[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}} p95",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "latency",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "red", "value": 20 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "RAG Pipeline Throughput",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "tg-prometheus"
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 24 },
|
||||
"id": 6,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(tg_consumer_processing_total{job=~\"graph-rag|document-rag\", status=\"success\"}[5m])) by (job)",
|
||||
"legendFormat": "{{job}} success/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(tg_consumer_processing_total{job=~\"graph-rag|document-rag\", status=\"error\"}[5m])) by (job)",
|
||||
"legendFormat": "{{job}} errors/s",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "queries/s",
|
||||
"axisPlacement": "auto",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 15,
|
||||
"gradientMode": "none",
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"unit": "ops",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byRegexp", "options": ".*errors.*" },
|
||||
"properties": [
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } },
|
||||
{ "id": "custom.fillOpacity", "value": 30 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["trustgraph", "rag", "pipeline"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "TrustGraph - RAG Pipeline",
|
||||
"uid": "tg-rag-pipeline",
|
||||
"version": 1
|
||||
}
|
||||
14
ts/deploy/grafana/provisioning/dashboards.yml
Normal file
14
ts/deploy/grafana/provisioning/dashboards.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
# Grafana dashboard provisioning.
# Registers a single file-based provider that loads every dashboard JSON found
# under /var/lib/grafana/dashboards into the "TrustGraph" folder.
apiVersion: 1

providers:
  - name: "TrustGraph"
    orgId: 1
    # Folder (and its stable UID) that the provisioned dashboards appear under.
    folder: "TrustGraph"
    folderUid: "trustgraph-dashboards"
    type: file
    # false: dashboards whose files disappear from disk may be removed.
    disableDeletion: false
    # Re-scan the dashboards directory for changes every 30 seconds.
    updateIntervalSeconds: 30
    # Allow provisioned dashboards to be edited from the Grafana UI.
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
      # Keep everything in the single folder above instead of mirroring
      # the on-disk directory layout.
      foldersFromFilesStructure: false
|
||||
49
ts/deploy/grafana/provisioning/datasources.yml
Normal file
49
ts/deploy/grafana/provisioning/datasources.yml
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# Grafana datasource provisioning.
# Declares the Prometheus, Loki and Tempo datasources with fixed UIDs so that
# dashboards and cross-datasource links can reference them by UID.
apiVersion: 1

# Remove previously provisioned datasources that are no longer listed here.
prune: true

datasources:
  # Metrics backend; the default datasource.
  - name: Prometheus
    type: prometheus
    access: proxy
    orgId: 1
    uid: "tg-prometheus"
    url: http://prometheus:9090
    basicAuth: false
    isDefault: true
    editable: true

  # Log backend.
  - name: Loki
    type: loki
    access: proxy
    orgId: 1
    uid: "tg-loki"
    url: http://loki:3100
    basicAuth: false
    editable: true

  # Trace backend, linked to the Loki and Prometheus datasources by UID.
  - name: Tempo
    type: tempo
    access: proxy
    orgId: 1
    uid: "tg-tempo"
    url: http://tempo:3200
    basicAuth: false
    editable: true
    jsonData:
      # Trace -> logs: query Loki in a window of one hour before the span
      # start to one hour after the span end, filtered by trace ID only.
      tracesToLogsV2:
        datasourceUid: "tg-loki"
        spanStartTimeShift: "-1h"
        spanEndTimeShift: "1h"
        filterByTraceID: true
        filterBySpanID: false
      # Trace -> metrics and service map both read from Prometheus.
      tracesToMetrics:
        datasourceUid: "tg-prometheus"
      serviceMap:
        datasourceUid: "tg-prometheus"
      nodeGraph:
        enabled: true
      search:
        hide: false
      lokiSearch:
        datasourceUid: "tg-loki"
|
||||
52
ts/deploy/loki/loki-config.yml
Normal file
52
ts/deploy/loki/loki-config.yml
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# Loki configuration: single instance, filesystem storage under /tmp/loki,
# in-memory ring, multi-tenancy/auth disabled.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: warn

common:
  instance_addr: 127.0.0.1
  path_prefix: /tmp/loki
  storage:
    filesystem:
      chunks_directory: /tmp/loki/chunks
      rules_directory: /tmp/loki/rules
  # Single replica with an in-memory ring: no clustering, no external KV store.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      # In-process query results cache, capped at 100 MB.
      embedded_cache:
        enabled: true
        max_size_mb: 100

limits_config:
  metric_aggregation_enabled: true

schema_config:
  configs:
    # TSDB index + filesystem object store, schema v13, one index per 24h.
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

pattern_ingester:
  enabled: true
  metric_aggregation:
    # Points back at this instance's own HTTP listener (see server above).
    loki_address: localhost:3100

ruler:
  # NOTE(review): no Alertmanager is configured elsewhere in the visible
  # deployment files; confirm something is reachable at this address.
  alertmanager_url: http://localhost:9093

frontend:
  encoding: protobuf

# Disable anonymous usage reporting.
analytics:
  reporting_enabled: false
|
||||
41
ts/deploy/otel-collector/config.yml
Normal file
41
ts/deploy/otel-collector/config.yml
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
# OpenTelemetry Collector configuration.
# Receives OTLP over gRPC (4317) and HTTP (4318), batches the data, forwards
# traces to Tempo and exposes metrics on a Prometheus scrape endpoint (8889).
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"
      http:
        endpoint: "0.0.0.0:4318"

processors:
  # Flush a batch every 5s or once 1024 items have accumulated.
  batch:
    timeout: 5s
    send_batch_size: 1024

exporters:
  # Traces go to Tempo's OTLP gRPC receiver; plaintext (no TLS).
  otlp/tempo:
    endpoint: "tempo:4317"
    tls:
      insecure: true

  # Metrics are exposed for scraping; exported metric names are prefixed
  # with the "tg" namespace and resource attributes become metric labels.
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: "tg"
    resource_to_telemetry_conversion:
      enabled: true

  # Defined for troubleshooting but not referenced by any pipeline below;
  # add "debug" to a pipeline's exporters list to activate it.
  debug:
    verbosity: basic

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/tempo]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
  telemetry:
    logs:
      level: warn
|
||||
36
ts/deploy/prometheus/prometheus.yml
Normal file
36
ts/deploy/prometheus/prometheus.yml
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Prometheus configuration for the TrustGraph TypeScript deployment.
# Scrapes Prometheus itself, NATS, the OpenTelemetry Collector and the gateway.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

  external_labels:
    monitor: "trustgraph-ts"

scrape_configs:
  # Prometheus self-monitoring
  - job_name: "prometheus"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "prometheus:9090"

  # NATS monitoring
  # NOTE(review): the NATS server's /varz monitoring endpoint returns JSON,
  # not the Prometheus text exposition format, so this scrape is expected to
  # fail to parse. Consider scraping a prometheus-nats-exporter instance
  # instead; left unchanged here because the compose services are not visible.
  - job_name: "nats"
    scrape_interval: 15s
    metrics_path: "/varz"
    static_configs:
      - targets:
          - "nats:8222"

  # OpenTelemetry Collector (exposes Prometheus metrics from OTLP pipeline)
  - job_name: "otel-collector"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "otel-collector:8889"

  # TrustGraph gateway (enabled when gateway container is running)
  # Scraped more often (5s) than the 15s global default.
  # NOTE(review): .env.example sets GATEWAY_PORT=8088; confirm the gateway
  # container really serves metrics on port 8000.
  - job_name: "gateway"
    scrape_interval: 5s
    static_configs:
      - targets:
          - "gateway:8000"
|
||||
49
ts/deploy/tempo/tempo-config.yml
Normal file
49
ts/deploy/tempo/tempo-config.yml
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# Grafana Tempo configuration: single instance with local trace storage under
# /tmp/tempo, OTLP ingest, and a metrics generator that remote-writes
# service-graph and span metrics to Prometheus.
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

ingester:
  # Cut a new block at least every 5 minutes.
  max_block_duration: 5m

compactor:
  compaction:
    # Keep trace blocks for 48 hours.
    block_retention: 48h

metrics_generator:
  registry:
    # Labels attached to every generated metric series.
    external_labels:
      source: tempo
      cluster: trustgraph-dev
  storage:
    path: /tmp/tempo/generator/wal
    # NOTE(review): Prometheus only accepts writes on /api/v1/write when it
    # is started with --web.enable-remote-write-receiver; confirm the
    # prometheus service sets that flag.
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true

storage:
  trace:
    backend: local
    wal:
      path: /tmp/tempo/wal
    local:
      path: /tmp/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      # Enable both generator processors for all tenants.
      processors:
        - service-graphs
        - span-metrics

# Search is on by default in Tempo 2.x, and the legacy top-level
# `search_enabled` flag is not part of the 2.x config schema (Tempo rejects
# unknown keys), so it is intentionally not set here.

# Disable anonymous usage reporting. Tempo's key for this is `usage_report`
# (`analytics` is the Loki spelling and is not a Tempo config field).
usage_report:
  reporting_enabled: false
|
||||
Loading…
Add table
Add a link
Reference in a new issue