feat(observability): integrate OpenTelemetry collector and configuration for enhanced telemetry

2026-07-10 22:32:16 +02:00 · 2026-05-23 00:17:23 +05:30 · 2026-05-23 00:17:23 +05:30 · df698e0216
commit df698e0216
parent 51e4d8b489
6 changed files with 160 additions and 7 deletions
--- a/docker/.env.example
+++ b/docker/.env.example
@ -304,6 +304,28 @@ STT_SERVICE=local/base
 # LANGSMITH_API_KEY=
 # LANGSMITH_PROJECT=surfsense

+# OpenTelemetry traces and metrics.
+# Enable the collector with: docker compose --profile observability up -d
+# SURFSENSE_ENABLE_OTEL=true
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
+#
+# Emergency kill switch: uncomment to disable the OpenTelemetry SDK entirely.
+# OTEL_SDK_DISABLED=true
+#
+# Grafana Cloud OTLP credentials. These are used only by the collector container.
+# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
+# GRAFANA_CLOUD_INSTANCE_ID=
+# GRAFANA_CLOUD_API_KEY=
+#
+# Optional host port overrides for the bundled OTel Collector. Only change
+# these if the host already uses 4317/4318/13133; backend containers still use
+# the internal Docker endpoint above.
+# OTEL_GRPC_PORT=4317
+# OTEL_HTTP_PORT=4318
+# OTEL_HEALTH_PORT=13133
+
 # ------------------------------------------------------------------------------
 # Advanced (optional)
 # ------------------------------------------------------------------------------
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -61,6 +61,29 @@ services:
      timeout: 5s
      retries: 5

+  # OpenTelemetry Collector: receives OTLP from the backend containers and
+  # forwards it to Grafana Cloud. Only started with the `observability` profile.
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:0.152.1
+    profiles:
+      - observability
+    command: ["--config=/etc/otelcol/config.yaml"]
+    volumes:
+      - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
+    environment:
+      GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
+      GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
+      GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
+    # The OTLP receivers are unauthenticated, so the host ports are published
+    # on loopback only by default. Containers on the Compose network reach the
+    # collector at otel-collector:4317 regardless of this binding. Set
+    # OTEL_BIND_ADDRESS=0.0.0.0 to accept telemetry from other hosts.
+    ports:
+      - "${OTEL_BIND_ADDRESS:-127.0.0.1}:${OTEL_GRPC_PORT:-4317}:4317"
+      - "${OTEL_BIND_ADDRESS:-127.0.0.1}:${OTEL_HTTP_PORT:-4318}:4318"
+      - "${OTEL_BIND_ADDRESS:-127.0.0.1}:${OTEL_HEALTH_PORT:-13133}:13133"
+    # memory_limiter percentages in otel-collector/config.yaml are relative to
+    # this limit.
+    mem_limit: 2g
+    restart: unless-stopped
+    # This check only proves the collector binary can execute; it does not
+    # probe the health_check extension listening on 13133.
+    healthcheck:
+      test: ["CMD", "/otelcol-contrib", "--version"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
  searxng:
    image: searxng/searxng:2026.3.13-3c1f68c59
    volumes:
--- a/docker/otel-collector/config.yaml
+++ b/docker/otel-collector/config.yaml
@ -0,0 +1,81 @@
+extensions:
+  # Basic-auth credentials used by the Grafana Cloud exporter; both values
+  # come from the collector container's environment.
+  basicauth/grafana_cloud:
+    client_auth:
+      password: ${env:GRAFANA_CLOUD_API_KEY}
+      username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
+  # Health endpoint, listening on all container interfaces.
+  health_check:
+    endpoint: '0.0.0.0:13133'
+
+receivers:
+  # OTLP ingest over both transports: HTTP on 4318 and gRPC on 4317.
+  otlp:
+    protocols:
+      http:
+        endpoint: '0.0.0.0:4318'
+      grpc:
+        endpoint: '0.0.0.0:4317'
+
+processors:
+  # The percentages below are relative to the memory limit of the collector
+  # container, so keep an explicit limit configured in docker-compose.yml or
+  # Coolify to make the thresholds predictable.
+  memory_limiter:
+    limit_percentage: 80
+    spike_limit_percentage: 25
+    check_interval: 1s
+
+  # Remove attributes that can carry credentials or raw SQL before export.
+  attributes/scrub:
+    actions:
+      - key: http.request.header.authorization
+        action: delete
+      - key: http.request.header.cookie
+        action: delete
+      # Set-Cookie response headers carry session tokens as well.
+      - key: http.response.header.set-cookie
+        action: delete
+      # db.statement is the legacy attribute name; newer instrumentations
+      # emit the same SQL text as db.query.text, so scrub both.
+      - key: db.statement
+        action: delete
+      - key: db.query.text
+        action: delete
+
+  # Tail-based sampling: buffer each trace for decision_wait, then evaluate
+  # the policies below against the whole trace.
+  # NOTE(review): with the baseline policy at 100%, every trace is kept, so the
+  # errors and slow-requests policies only start to matter once that
+  # percentage is lowered.
+  tail_sampling:
+    decision_wait: 10s
+    expected_new_traces_per_sec: 100
+    num_traces: 50000
+    policies:
+      - type: status_code
+        name: errors
+        status_code:
+          status_codes:
+            - ERROR
+      - type: latency
+        name: slow-requests
+        latency:
+          threshold_ms: 500
+      - type: probabilistic
+        name: baseline
+        probabilistic:
+          sampling_percentage: 100
+
+  # Flush a batch once it reaches 1024 items or after 5s, whichever comes
+  # first; a single batch never exceeds 2048 items.
+  batch:
+    send_batch_size: 1024
+    send_batch_max_size: 2048
+    timeout: 5s
+
+exporters:
+  # OTLP/HTTP export to Grafana Cloud, authenticated through the
+  # basicauth/grafana_cloud extension.
+  otlp_http/grafana_cloud:
+    endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
+    auth:
+      authenticator: basicauth/grafana_cloud
+    # Retry failed exports, backing off from 5s up to 30s between attempts
+    # and giving up after 300s in total.
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_interval: 30s
+      max_elapsed_time: 300s
+    # Queue of data waiting to be exported.
+    sending_queue:
+      enabled: true
+      queue_size: 10000
+
+service:
+  extensions:
+    - health_check
+    - basicauth/grafana_cloud
+  pipelines:
+    # Processor order matters: memory_limiter runs first and batch runs last.
+    traces:
+      receivers:
+        - otlp
+      processors:
+        - memory_limiter
+        - attributes/scrub
+        - tail_sampling
+        - batch
+      exporters:
+        - otlp_http/grafana_cloud
+    metrics:
+      receivers:
+        - otlp
+      processors:
+        - memory_limiter
+        - batch
+      exporters:
+        - otlp_http/grafana_cloud