diff --git a/docker/.env.example b/docker/.env.example index 4de35a5e9..2ff15e71d 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -304,6 +304,28 @@ STT_SERVICE=local/base # LANGSMITH_API_KEY= # LANGSMITH_PROJECT=surfsense +# OpenTelemetry traces and metrics. +# Enable the collector with: docker compose --profile observability up -d +# SURFSENSE_ENABLE_OTEL=true +# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +# OTEL_EXPORTER_OTLP_PROTOCOL=grpc +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# +# Emergency kill switch. +# OTEL_SDK_DISABLED=true +# +# Grafana Cloud OTLP credentials. These are used only by the collector container. +# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp +# GRAFANA_CLOUD_INSTANCE_ID= +# GRAFANA_CLOUD_API_KEY= +# +# Optional host port overrides for the bundled OTel Collector. Only change +# these if the host already uses 4317/4318/13133; backend containers still use +# the internal Docker endpoint above. 
+# OTEL_GRPC_PORT=4317 +# OTEL_HTTP_PORT=4318 +# OTEL_HEALTH_PORT=13133 + # ------------------------------------------------------------------------------ # Advanced (optional) # ------------------------------------------------------------------------------ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 82d77f826..06a3ac79a 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -61,6 +61,29 @@ services: timeout: 5s retries: 5 + otel-collector: + image: otel/opentelemetry-collector-contrib:0.152.1 + profiles: + - observability + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro + environment: + GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-} + GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-} + GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-} + ports: + - "${OTEL_GRPC_PORT:-4317}:4317" + - "${OTEL_HTTP_PORT:-4318}:4318" + - "${OTEL_HEALTH_PORT:-13133}:13133" + mem_limit: 2g + restart: unless-stopped + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 30s + timeout: 5s + retries: 3 + searxng: image: searxng/searxng:2026.3.13-3c1f68c59 volumes: diff --git a/docker/otel-collector/config.yaml b/docker/otel-collector/config.yaml new file mode 100644 index 000000000..f495eff9b --- /dev/null +++ b/docker/otel-collector/config.yaml @@ -0,0 +1,81 @@ +extensions: + health_check: + endpoint: 0.0.0.0:13133 + basicauth/grafana_cloud: + client_auth: + username: ${env:GRAFANA_CLOUD_INSTANCE_ID} + password: ${env:GRAFANA_CLOUD_API_KEY} + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # Percentage limits are calculated against the collector container memory limit. + # Keep docker-compose.yml/Coolify memory limit set for predictability. 
+ memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + + attributes/scrub: + actions: + - key: http.request.header.authorization + action: delete + - key: http.request.header.cookie + action: delete + - key: db.statement + action: delete + + tail_sampling: + decision_wait: 10s + num_traces: 50000 + expected_new_traces_per_sec: 100 + policies: + - name: errors + type: status_code + status_code: + status_codes: [ERROR] + - name: slow-requests + type: latency + latency: + threshold_ms: 500 + - name: baseline + type: probabilistic + probabilistic: + sampling_percentage: 100 + + batch: + timeout: 5s + send_batch_size: 1024 + send_batch_max_size: 2048 + +exporters: + otlp_http/grafana_cloud: + endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT} + auth: + authenticator: basicauth/grafana_cloud + sending_queue: + enabled: true + queue_size: 10000 + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + +service: + extensions: [health_check, basicauth/grafana_cloud] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, attributes/scrub, tail_sampling, batch] + exporters: [otlp_http/grafana_cloud] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp_http/grafana_cloud] diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 908f4645d..91b03770e 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -310,10 +310,9 @@ LANGSMITH_PROJECT=surfsense # use http://otel-lgtm:4317 instead. 
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf -# OTEL_SERVICE_NAME=surfsense-backend -# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production -# OTEL_METRIC_EXPORT_INTERVAL=60000 # ms -# OTEL_SDK_DISABLED=false # spec kill-switch +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes +# OTEL_SDK_DISABLED=true # emergency kill-switch # Skills + subagents # SURFSENSE_ENABLE_SKILLS=false diff --git a/surfsense_backend/app/observability/bootstrap.py b/surfsense_backend/app/observability/bootstrap.py index ad1d02ea8..f9ed65e7b 100644 --- a/surfsense_backend/app/observability/bootstrap.py +++ b/surfsense_backend/app/observability/bootstrap.py @@ -71,7 +71,7 @@ def _build_resource(): "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"), "service.version": _package_version(), "service.instance.id": socket.gethostname(), - "deployment.environment": _deployment_environment(), + "deployment.environment.name": _deployment_environment(), } ) @@ -336,7 +336,10 @@ def init_logs() -> None: def _run() -> None: from opentelemetry.instrumentation.logging import LoggingInstrumentor - LoggingInstrumentor().instrument() + # Required for stdlib LogRecords to receive otelTraceID/otelSpanID. + # logging.basicConfig is already installed by main.py, so this does not + # take over formatting in normal app startup. 
+ LoggingInstrumentor().instrument(set_logging_format=True) if _safe_instrument("logging", _run): _LOGS_INITIALIZED = True diff --git a/surfsense_backend/tests/unit/observability/test_otel.py b/surfsense_backend/tests/unit/observability/test_otel.py index be40cccde..52ccba82f 100644 --- a/surfsense_backend/tests/unit/observability/test_otel.py +++ b/surfsense_backend/tests/unit/observability/test_otel.py @@ -113,12 +113,37 @@ class TestBootstrapConfig: resource = bootstrap._build_resource() attrs = dict(resource.attributes) assert attrs["service.name"] == "custom-backend" - assert attrs["deployment.environment"] == "test" + assert attrs["deployment.environment.name"] == "test" assert attrs["service.instance.id"] def test_shutdown_is_safe_without_providers(self) -> None: bootstrap.shutdown_otel() + def test_init_logs_enables_log_correlation( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + calls: list[dict[str, object]] = [] + + class FakeLoggingInstrumentor: + def instrument(self, **kwargs: object) -> None: + calls.append(kwargs) + + def fake_safe_instrument(name: str, callback): + assert name == "logging" + monkeypatch.setattr( + "opentelemetry.instrumentation.logging.LoggingInstrumentor", + FakeLoggingInstrumentor, + ) + callback() + return True + + monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False) + monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument) + + bootstrap.init_logs() + + assert calls == [{"set_logging_format": True}] + class TestMetricHelpers: def test_all_metric_helpers_noop_safely_when_disabled(self) -> None: