feat(observability): integrate OpenTelemetry collector and configuration for enhanced telemetry

This commit is contained in:
Anish Sarkar 2026-05-23 00:17:23 +05:30
parent 51e4d8b489
commit df698e0216
6 changed files with 160 additions and 7 deletions

View file

@ -304,6 +304,28 @@ STT_SERVICE=local/base
# LANGSMITH_API_KEY=
# LANGSMITH_PROJECT=surfsense
# OpenTelemetry traces and metrics.
# Enable the collector with: docker compose --profile observability up -d
# SURFSENSE_ENABLE_OTEL=true
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
#
# Emergency kill switch: uncomment to disable the OpenTelemetry SDK entirely
# (standard OTel SDK environment variable).
# OTEL_SDK_DISABLED=true
#
# Grafana Cloud OTLP credentials. These are used only by the collector container.
# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
# GRAFANA_CLOUD_INSTANCE_ID=
# GRAFANA_CLOUD_API_KEY=
#
# Optional host port overrides for the bundled OTel Collector. Only change
# these if the host already uses 4317/4318/13133; backend containers still use
# the internal Docker endpoint above.
# OTEL_GRPC_PORT=4317
# OTEL_HTTP_PORT=4318
# OTEL_HEALTH_PORT=13133
# ------------------------------------------------------------------------------
# Advanced (optional)
# ------------------------------------------------------------------------------

View file

@ -61,6 +61,29 @@ services:
timeout: 5s
retries: 5
# Bundled OpenTelemetry Collector. Only started when the `observability`
# profile is enabled (docker compose --profile observability up -d).
otel-collector:
  image: otel/opentelemetry-collector-contrib:0.152.1
  profiles:
    - observability
  command:
    - "--config=/etc/otelcol/config.yaml"
  volumes:
    # Collector configuration is mounted read-only from the repository.
    - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
  environment:
    # Forwarded from the host environment / .env; each defaults to an empty
    # string when unset.
    GRAFANA_CLOUD_OTLP_ENDPOINT: "${GRAFANA_CLOUD_OTLP_ENDPOINT:-}"
    GRAFANA_CLOUD_INSTANCE_ID: "${GRAFANA_CLOUD_INSTANCE_ID:-}"
    GRAFANA_CLOUD_API_KEY: "${GRAFANA_CLOUD_API_KEY:-}"
  ports:
    # host:container — host side is overridable, container side is fixed.
    - "${OTEL_GRPC_PORT:-4317}:4317"
    - "${OTEL_HTTP_PORT:-4318}:4318"
    - "${OTEL_HEALTH_PORT:-13133}:13133"
  mem_limit: 2g
  restart: unless-stopped
  healthcheck:
    # This only verifies that the collector binary can execute; it does not
    # query the health_check endpoint on port 13133.
    # NOTE(review): presumably chosen because the image ships no shell or
    # HTTP client — confirm.
    test:
      - "CMD"
      - "/otelcol-contrib"
      - "--version"
    interval: 30s
    timeout: 5s
    retries: 3
searxng:
image: searxng/searxng:2026.3.13-3c1f68c59
volumes:

View file

@ -0,0 +1,81 @@
# Extensions run alongside the pipelines; they are activated in
# `service.extensions`.
extensions:
  # Health endpoint, listening on all interfaces inside the container.
  health_check:
    endpoint: "0.0.0.0:13133"

  # Basic-auth client credentials, resolved from the container environment.
  basicauth/grafana_cloud:
    client_auth:
      username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
      password: ${env:GRAFANA_CLOUD_API_KEY}
# A single OTLP receiver accepting both gRPC (4317) and HTTP (4318).
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: "0.0.0.0:4317"
      http:
        endpoint: "0.0.0.0:4318"
processors:
  # Percentage limits are calculated against the collector container memory limit.
  # Keep docker-compose.yml/Coolify memory limit set for predictability.
  memory_limiter:
    check_interval: 1s
    limit_percentage: 80
    spike_limit_percentage: 25

  # Remove span attributes that can carry credentials or raw query text
  # before anything leaves the collector.
  attributes/scrub:
    actions:
      - key: http.request.header.authorization
        action: delete
      - key: http.request.header.cookie
        action: delete
      # Deprecated database semantic-convention attribute; still emitted by
      # older instrumentation.
      - key: db.statement
        action: delete
      # Current semantic-convention name for the same data. Without this,
      # up-to-date instrumentation would export query text unscrubbed.
      - key: db.query.text
        action: delete

  # Buffers spans for `decision_wait` and then decides per trace. A trace is
  # kept when any policy below matches.
  tail_sampling:
    decision_wait: 10s
    num_traces: 50000
    expected_new_traces_per_sec: 100
    policies:
      - name: errors
        type: status_code
        status_code:
          status_codes: [ERROR]
      - name: slow-requests
        type: latency
        latency:
          threshold_ms: 500
      # At 100 every trace is kept, so the two policies above only start to
      # matter once this percentage is lowered.
      - name: baseline
        type: probabilistic
        probabilistic:
          sampling_percentage: 100

  # Flush after 5s or 1024 items, whichever comes first; a single batch is
  # capped at 2048 items.
  batch:
    timeout: 5s
    send_batch_size: 1024
    send_batch_max_size: 2048
# Single OTLP/HTTP exporter targeting the Grafana Cloud gateway.
exporters:
  otlp_http/grafana_cloud:
    # Gateway URL is supplied through the container environment.
    endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
    auth:
      # Refers to the basic-auth extension declared under `extensions`.
      authenticator: basicauth/grafana_cloud
    # Buffer outgoing data in a queue while the backend is slow or unreachable.
    sending_queue:
      enabled: true
      queue_size: 10000
    # Back off from 5s up to 30s between attempts; stop retrying after 300s.
    retry_on_failure:
      enabled: true
      initial_interval: 5s
      max_interval: 30s
      max_elapsed_time: 300s
# Wires the declared components together. Processors run in the order listed.
service:
  extensions:
    - health_check
    - basicauth/grafana_cloud
  pipelines:
    # Traces: memory guard first, then scrubbing, sampling and batching.
    traces:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - attributes/scrub
        - tail_sampling
        - batch
      exporters:
        - otlp_http/grafana_cloud
    # Metrics: memory guard and batching only.
    metrics:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - batch
      exporters:
        - otlp_http/grafana_cloud