mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat(observability): integrate OpenTelemetry collector and configuration for enhanced telemetry
This commit is contained in:
parent
51e4d8b489
commit
df698e0216
6 changed files with 160 additions and 7 deletions
|
|
@ -304,6 +304,28 @@ STT_SERVICE=local/base
|
||||||
# LANGSMITH_API_KEY=
|
# LANGSMITH_API_KEY=
|
||||||
# LANGSMITH_PROJECT=surfsense
|
# LANGSMITH_PROJECT=surfsense
|
||||||
|
|
||||||
|
# OpenTelemetry traces and metrics.
|
||||||
|
# Enable the collector with: docker compose --profile observability up -d
|
||||||
|
# SURFSENSE_ENABLE_OTEL=true
|
||||||
|
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||||
|
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
|
||||||
|
# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
|
||||||
|
#
|
||||||
|
# Emergency kill switch: set this to disable the OpenTelemetry SDK entirely.
|
||||||
|
# OTEL_SDK_DISABLED=true
|
||||||
|
#
|
||||||
|
# Grafana Cloud OTLP credentials. These are used only by the collector container.
|
||||||
|
# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
|
||||||
|
# GRAFANA_CLOUD_INSTANCE_ID=
|
||||||
|
# GRAFANA_CLOUD_API_KEY=
|
||||||
|
#
|
||||||
|
# Optional host port overrides for the bundled OTel Collector. Only change
|
||||||
|
# these if the host already uses 4317/4318/13133; backend containers still use
|
||||||
|
# the internal Docker endpoint above.
|
||||||
|
# OTEL_GRPC_PORT=4317
|
||||||
|
# OTEL_HTTP_PORT=4318
|
||||||
|
# OTEL_HEALTH_PORT=13133
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
# Advanced (optional)
|
# Advanced (optional)
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,29 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
otel-collector:
|
||||||
|
image: otel/opentelemetry-collector-contrib:0.152.1
|
||||||
|
profiles:
|
||||||
|
- observability
|
||||||
|
command: ["--config=/etc/otelcol/config.yaml"]
|
||||||
|
volumes:
|
||||||
|
- ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
|
||||||
|
environment:
|
||||||
|
GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
|
||||||
|
GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
|
||||||
|
GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
|
||||||
|
ports:
|
||||||
|
- "${OTEL_GRPC_PORT:-4317}:4317"
|
||||||
|
- "${OTEL_HTTP_PORT:-4318}:4318"
|
||||||
|
- "${OTEL_HEALTH_PORT:-13133}:13133"
|
||||||
|
mem_limit: 2g
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "/otelcol-contrib", "--version"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
searxng:
|
searxng:
|
||||||
image: searxng/searxng:2026.3.13-3c1f68c59
|
image: searxng/searxng:2026.3.13-3c1f68c59
|
||||||
volumes:
|
volumes:
|
||||||
|
|
|
||||||
81
docker/otel-collector/config.yaml
Normal file
81
docker/otel-collector/config.yaml
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
extensions:
|
||||||
|
health_check:
|
||||||
|
endpoint: 0.0.0.0:13133
|
||||||
|
basicauth/grafana_cloud:
|
||||||
|
client_auth:
|
||||||
|
username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
|
||||||
|
password: ${env:GRAFANA_CLOUD_API_KEY}
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: 0.0.0.0:4317
|
||||||
|
http:
|
||||||
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
processors:
|
||||||
|
# Percentage limits are calculated against the collector container memory limit.
|
||||||
|
# Keep the container memory limit set (mem_limit in docker-compose.yml, or the Coolify equivalent) so these percentages stay predictable.
|
||||||
|
memory_limiter:
|
||||||
|
check_interval: 1s
|
||||||
|
limit_percentage: 80
|
||||||
|
spike_limit_percentage: 25
|
||||||
|
|
||||||
|
attributes/scrub:
|
||||||
|
actions:
|
||||||
|
- key: http.request.header.authorization
|
||||||
|
action: delete
|
||||||
|
- key: http.request.header.cookie
|
||||||
|
action: delete
|
||||||
|
- key: db.statement
|
||||||
|
action: delete
|
||||||
|
|
||||||
|
tail_sampling:
|
||||||
|
decision_wait: 10s
|
||||||
|
num_traces: 50000
|
||||||
|
expected_new_traces_per_sec: 100
|
||||||
|
policies:
|
||||||
|
- name: errors
|
||||||
|
type: status_code
|
||||||
|
status_code:
|
||||||
|
status_codes: [ERROR]
|
||||||
|
- name: slow-requests
|
||||||
|
type: latency
|
||||||
|
latency:
|
||||||
|
threshold_ms: 500
|
||||||
|
- name: baseline
|
||||||
|
type: probabilistic
|
||||||
|
probabilistic:
|
||||||
|
sampling_percentage: 100
|
||||||
|
|
||||||
|
batch:
|
||||||
|
timeout: 5s
|
||||||
|
send_batch_size: 1024
|
||||||
|
send_batch_max_size: 2048
|
||||||
|
|
||||||
|
exporters:
|
||||||
|
otlp_http/grafana_cloud:
|
||||||
|
endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
|
||||||
|
auth:
|
||||||
|
authenticator: basicauth/grafana_cloud
|
||||||
|
sending_queue:
|
||||||
|
enabled: true
|
||||||
|
queue_size: 10000
|
||||||
|
retry_on_failure:
|
||||||
|
enabled: true
|
||||||
|
initial_interval: 5s
|
||||||
|
max_interval: 30s
|
||||||
|
max_elapsed_time: 300s
|
||||||
|
|
||||||
|
service:
|
||||||
|
extensions: [health_check, basicauth/grafana_cloud]
|
||||||
|
pipelines:
|
||||||
|
traces:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, attributes/scrub, tail_sampling, batch]
|
||||||
|
exporters: [otlp_http/grafana_cloud]
|
||||||
|
metrics:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, batch]
|
||||||
|
exporters: [otlp_http/grafana_cloud]
|
||||||
|
|
@ -310,10 +310,9 @@ LANGSMITH_PROJECT=surfsense
|
||||||
# use http://otel-lgtm:4317 instead.
|
# use http://otel-lgtm:4317 instead.
|
||||||
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||||
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf
|
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf
|
||||||
# OTEL_SERVICE_NAME=surfsense-backend
|
# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense
|
||||||
# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production
|
# OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes
|
||||||
# OTEL_METRIC_EXPORT_INTERVAL=60000 # ms
|
# OTEL_SDK_DISABLED=true # emergency kill-switch
|
||||||
# OTEL_SDK_DISABLED=false # spec kill-switch
|
|
||||||
|
|
||||||
# Skills + subagents
|
# Skills + subagents
|
||||||
# SURFSENSE_ENABLE_SKILLS=false
|
# SURFSENSE_ENABLE_SKILLS=false
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ def _build_resource():
|
||||||
"service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
|
"service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
|
||||||
"service.version": _package_version(),
|
"service.version": _package_version(),
|
||||||
"service.instance.id": socket.gethostname(),
|
"service.instance.id": socket.gethostname(),
|
||||||
"deployment.environment": _deployment_environment(),
|
"deployment.environment.name": _deployment_environment(),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -336,7 +336,10 @@ def init_logs() -> None:
|
||||||
def _run() -> None:
|
def _run() -> None:
|
||||||
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
||||||
|
|
||||||
LoggingInstrumentor().instrument()
|
# Required for stdlib LogRecords to receive otelTraceID/otelSpanID.
|
||||||
|
# main.py has already configured logging via logging.basicConfig, so this does not
|
||||||
|
# take over formatting in normal app startup.
|
||||||
|
LoggingInstrumentor().instrument(set_logging_format=True)
|
||||||
|
|
||||||
if _safe_instrument("logging", _run):
|
if _safe_instrument("logging", _run):
|
||||||
_LOGS_INITIALIZED = True
|
_LOGS_INITIALIZED = True
|
||||||
|
|
|
||||||
|
|
@ -113,12 +113,37 @@ class TestBootstrapConfig:
|
||||||
resource = bootstrap._build_resource()
|
resource = bootstrap._build_resource()
|
||||||
attrs = dict(resource.attributes)
|
attrs = dict(resource.attributes)
|
||||||
assert attrs["service.name"] == "custom-backend"
|
assert attrs["service.name"] == "custom-backend"
|
||||||
assert attrs["deployment.environment"] == "test"
|
assert attrs["deployment.environment.name"] == "test"
|
||||||
assert attrs["service.instance.id"]
|
assert attrs["service.instance.id"]
|
||||||
|
|
||||||
def test_shutdown_is_safe_without_providers(self) -> None:
|
def test_shutdown_is_safe_without_providers(self) -> None:
|
||||||
bootstrap.shutdown_otel()
|
bootstrap.shutdown_otel()
|
||||||
|
|
||||||
|
def test_init_logs_enables_log_correlation(
|
||||||
|
self, monkeypatch: pytest.MonkeyPatch
|
||||||
|
) -> None:
|
||||||
|
calls: list[dict[str, object]] = []
|
||||||
|
|
||||||
|
class FakeLoggingInstrumentor:
|
||||||
|
def instrument(self, **kwargs: object) -> None:
|
||||||
|
calls.append(kwargs)
|
||||||
|
|
||||||
|
def fake_safe_instrument(name: str, callback):
|
||||||
|
assert name == "logging"
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"opentelemetry.instrumentation.logging.LoggingInstrumentor",
|
||||||
|
FakeLoggingInstrumentor,
|
||||||
|
)
|
||||||
|
callback()
|
||||||
|
return True
|
||||||
|
|
||||||
|
monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False)
|
||||||
|
monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument)
|
||||||
|
|
||||||
|
bootstrap.init_logs()
|
||||||
|
|
||||||
|
assert calls == [{"set_logging_format": True}]
|
||||||
|
|
||||||
|
|
||||||
class TestMetricHelpers:
|
class TestMetricHelpers:
|
||||||
def test_all_metric_helpers_noop_safely_when_disabled(self) -> None:
|
def test_all_metric_helpers_noop_safely_when_disabled(self) -> None:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue