feat(observability): integrate OpenTelemetry collector and configuration for enhanced telemetry

2026-05-25 19:15:18 +02:00 · 2026-05-23 00:17:23 +05:30 · 2026-05-23 00:17:23 +05:30 · df698e0216
commit df698e0216
parent 51e4d8b489
6 changed files with 160 additions and 7 deletions
--- a/docker/.env.example
+++ b/docker/.env.example
@ -304,6 +304,28 @@ STT_SERVICE=local/base
 # LANGSMITH_API_KEY=
 # LANGSMITH_PROJECT=surfsense

+# OpenTelemetry traces and metrics.
+# Enable the collector with: docker compose --profile observability up -d
+# SURFSENSE_ENABLE_OTEL=true
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
+# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
+#
+# Emergency kill switch.
+# OTEL_SDK_DISABLED=true
+#
+# Grafana Cloud OTLP credentials. These are used only by the collector container.
+# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
+# GRAFANA_CLOUD_INSTANCE_ID=
+# GRAFANA_CLOUD_API_KEY=
+#
+# Optional host port overrides for the bundled OTel Collector. Only change
+# these if the host already uses 4317/4318/13133; backend containers still use
+# the internal Docker endpoint above.
+# OTEL_GRPC_PORT=4317
+# OTEL_HTTP_PORT=4318
+# OTEL_HEALTH_PORT=13133
+
 # ------------------------------------------------------------------------------
 # Advanced (optional)
 # ------------------------------------------------------------------------------
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -61,6 +61,29 @@ services:
      timeout: 5s
      retries: 5

+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:0.152.1
+    profiles:
+      - observability
+    command: ["--config=/etc/otelcol/config.yaml"]
+    volumes:
+      - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
+    environment:
+      GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
+      GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
+      GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
+    ports:
+      - "${OTEL_GRPC_PORT:-4317}:4317"
+      - "${OTEL_HTTP_PORT:-4318}:4318"
+      - "${OTEL_HEALTH_PORT:-13133}:13133"
+    mem_limit: 2g
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "/otelcol-contrib", "--version"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
  searxng:
    image: searxng/searxng:2026.3.13-3c1f68c59
    volumes:
--- a/docker/otel-collector/config.yaml
+++ b/docker/otel-collector/config.yaml
@ -0,0 +1,81 @@
+extensions:
+  health_check:
+    endpoint: 0.0.0.0:13133
+  basicauth/grafana_cloud:
+    client_auth:
+      username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
+      password: ${env:GRAFANA_CLOUD_API_KEY}
+
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  # Percentage limits are calculated against the collector container memory limit.
+  # Keep a memory limit set in docker-compose.yml/Coolify so they stay predictable.
+  memory_limiter:
+    check_interval: 1s
+    limit_percentage: 80
+    spike_limit_percentage: 25
+
+  attributes/scrub:
+    actions:
+      - key: http.request.header.authorization
+        action: delete
+      - key: http.request.header.cookie
+        action: delete
+      - key: db.statement
+        action: delete
+
+  tail_sampling:
+    decision_wait: 10s
+    num_traces: 50000
+    expected_new_traces_per_sec: 100
+    policies:
+      - name: errors
+        type: status_code
+        status_code:
+          status_codes: [ERROR]
+      - name: slow-requests
+        type: latency
+        latency:
+          threshold_ms: 500
+      - name: baseline
+        type: probabilistic
+        probabilistic:
+          sampling_percentage: 100
+
+  batch:
+    timeout: 5s
+    send_batch_size: 1024
+    send_batch_max_size: 2048
+
+exporters:
+  otlp_http/grafana_cloud:
+    endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
+    auth:
+      authenticator: basicauth/grafana_cloud
+    sending_queue:
+      enabled: true
+      queue_size: 10000
+    retry_on_failure:
+      enabled: true
+      initial_interval: 5s
+      max_interval: 30s
+      max_elapsed_time: 300s
+
+service:
+  extensions: [health_check, basicauth/grafana_cloud]
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, attributes/scrub, tail_sampling, batch]
+      exporters: [otlp_http/grafana_cloud]
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, batch]
+      exporters: [otlp_http/grafana_cloud]
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -310,10 +310,9 @@ LANGSMITH_PROJECT=surfsense
 # use http://otel-lgtm:4317 instead.
 # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc                        # or http/protobuf
-# OTEL_SERVICE_NAME=surfsense-backend
-# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production
-# OTEL_METRIC_EXPORT_INTERVAL=60000                       # ms
-# OTEL_SDK_DISABLED=false                                 # spec kill-switch
+# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
+# OTEL_METRIC_EXPORT_INTERVAL=300000                      # ms; 5 minutes
+# OTEL_SDK_DISABLED=true                                  # emergency kill-switch

 # Skills + subagents
 # SURFSENSE_ENABLE_SKILLS=false
--- a/surfsense_backend/app/observability/bootstrap.py
+++ b/surfsense_backend/app/observability/bootstrap.py
@ -71,7 +71,7 @@ def _build_resource():
            "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
            "service.version": _package_version(),
            "service.instance.id": socket.gethostname(),
-            "deployment.environment": _deployment_environment(),
+            "deployment.environment.name": _deployment_environment(),
        }
    )

@ -336,7 +336,10 @@ def init_logs() -> None:
    def _run() -> None:
        from opentelemetry.instrumentation.logging import LoggingInstrumentor

-        LoggingInstrumentor().instrument()
+        # set_logging_format=True has the instrumentor call logging.basicConfig()
+        # with a format that includes otelTraceID/otelSpanID. main.py configures
+        # logging first, so this does not take over formatting in normal startup.
+        LoggingInstrumentor().instrument(set_logging_format=True)

    if _safe_instrument("logging", _run):
        _LOGS_INITIALIZED = True
--- a/surfsense_backend/tests/unit/observability/test_otel.py
+++ b/surfsense_backend/tests/unit/observability/test_otel.py
@ -113,12 +113,37 @@ class TestBootstrapConfig:
        resource = bootstrap._build_resource()
        attrs = dict(resource.attributes)
        assert attrs["service.name"] == "custom-backend"
-        assert attrs["deployment.environment"] == "test"
+        assert attrs["deployment.environment.name"] == "test"
        assert attrs["service.instance.id"]

    def test_shutdown_is_safe_without_providers(self) -> None:
        bootstrap.shutdown_otel()

+    def test_init_logs_enables_log_correlation(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        calls: list[dict[str, object]] = []
+
+        class FakeLoggingInstrumentor:
+            def instrument(self, **kwargs: object) -> None:
+                calls.append(kwargs)
+
+        def fake_safe_instrument(name: str, callback):
+            assert name == "logging"
+            monkeypatch.setattr(
+                "opentelemetry.instrumentation.logging.LoggingInstrumentor",
+                FakeLoggingInstrumentor,
+            )
+            callback()
+            return True
+
+        monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False)
+        monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument)
+
+        bootstrap.init_logs()
+
+        assert calls == [{"set_logging_format": True}]
+

 class TestMetricHelpers:
    def test_all_metric_helpers_noop_safely_when_disabled(self) -> None: