mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-27 19:25:15 +02:00
feat(observability): add telemetry error and event helpers
This commit is contained in:
parent
0fe2bba5a8
commit
dbb652d4f8
3 changed files with 241 additions and 8 deletions
|
|
@ -20,6 +20,19 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_INSTRUMENTATION_NAME = "surfsense.platform"
|
_INSTRUMENTATION_NAME = "surfsense.platform"
|
||||||
_OBSERVABLES_REGISTERED = False
|
_OBSERVABLES_REGISTERED = False
|
||||||
|
_ERROR_CATEGORY_UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
_ERROR_CATEGORY_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = (
|
||||||
|
("rate_limited", ("ratelimit", "rate_limit", "toomanyrequests", "429")),
|
||||||
|
("auth_failed", ("authentication", "auth", "unauthorized", "forbidden")),
|
||||||
|
("quota_exhausted", ("quota", "insufficient", "credit", "billing")),
|
||||||
|
("timeout", ("timeout", "timedout", "deadline")),
|
||||||
|
("network_failed", ("connection", "connect", "network", "dns", "socket")),
|
||||||
|
("server_error", ("internalserver", "serviceunavailable", "badgateway", "gateway")),
|
||||||
|
("lock_contention", ("lock", "busy", "contention", "alreadyrunning")),
|
||||||
|
("unsupported_format", ("unsupported", "format", "filetype")),
|
||||||
|
("provider_error", ("provider", "apierror", "apistatus", "badrequest")),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _package_version() -> str:
|
def _package_version() -> str:
|
||||||
|
|
@ -47,6 +60,36 @@ def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _attrs_with_optional_error_category(
|
||||||
|
attrs: dict[str, Any], error_category: str | None
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
if error_category:
|
||||||
|
return {**attrs, "error.category": error_category}
|
||||||
|
return attrs
|
||||||
|
|
||||||
|
|
||||||
|
def categorize_exception(exc: BaseException | None) -> str:
|
||||||
|
"""Return a low-cardinality category for an exception."""
|
||||||
|
if exc is None:
|
||||||
|
return _ERROR_CATEGORY_UNKNOWN
|
||||||
|
haystack = " ".join(
|
||||||
|
cls.__name__.replace("-", "").replace("_", "").lower()
|
||||||
|
for cls in type(exc).__mro__
|
||||||
|
)
|
||||||
|
for category, hints in _ERROR_CATEGORY_HINTS:
|
||||||
|
if any(hint in haystack for hint in hints):
|
||||||
|
return category
|
||||||
|
return _ERROR_CATEGORY_UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def parse_celery_task_label(task_name: str | None) -> str:
|
||||||
|
"""Return the operation token from a Celery task name."""
|
||||||
|
if not task_name:
|
||||||
|
return "unknown"
|
||||||
|
operation = str(task_name).split("_", 1)[0].strip()
|
||||||
|
return operation or "unknown"
|
||||||
|
|
||||||
|
|
||||||
def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None:
|
def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None:
|
||||||
if not _is_enabled():
|
if not _is_enabled():
|
||||||
return
|
return
|
||||||
|
|
@ -262,6 +305,15 @@ def _celery_heartbeat_failures():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def _celery_queue_latency():
|
||||||
|
return _get_meter().create_histogram(
|
||||||
|
"surfsense.celery.queue.latency",
|
||||||
|
unit="s",
|
||||||
|
description="Time SurfSense Celery tasks spend waiting in queue.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def record_model_call_duration(
|
def record_model_call_duration(
|
||||||
duration_ms: float, *, model: str | None, provider: str | None
|
duration_ms: float, *, model: str | None, provider: str | None
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
@ -359,11 +411,16 @@ def record_connector_sync_duration(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def record_connector_sync_outcome(*, connector_type: str | None, status: str) -> None:
|
def record_connector_sync_outcome(
|
||||||
|
*, connector_type: str | None, status: str, error_category: str | None = None
|
||||||
|
) -> None:
|
||||||
_add(
|
_add(
|
||||||
_connector_sync_outcome(),
|
_connector_sync_outcome(),
|
||||||
1,
|
1,
|
||||||
|
_attrs_with_optional_error_category(
|
||||||
{"connector.type": connector_type or "unknown", "status": status},
|
{"connector.type": connector_type or "unknown", "status": status},
|
||||||
|
error_category,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -398,11 +455,15 @@ def record_chat_request_outcome(
|
||||||
flow: str,
|
flow: str,
|
||||||
outcome: str,
|
outcome: str,
|
||||||
agent_mode: str | None = None,
|
agent_mode: str | None = None,
|
||||||
|
error_category: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
_add(
|
_add(
|
||||||
_chat_request_outcome(),
|
_chat_request_outcome(),
|
||||||
1,
|
1,
|
||||||
|
_attrs_with_optional_error_category(
|
||||||
{"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
|
{"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
|
||||||
|
error_category,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -464,15 +525,19 @@ def record_etl_extract_outcome(
|
||||||
etl_service: str | None,
|
etl_service: str | None,
|
||||||
content_type: str | None,
|
content_type: str | None,
|
||||||
status: str,
|
status: str,
|
||||||
|
error_category: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
_add(
|
_add(
|
||||||
_etl_extract_outcome(),
|
_etl_extract_outcome(),
|
||||||
1,
|
1,
|
||||||
|
_attrs_with_optional_error_category(
|
||||||
{
|
{
|
||||||
"etl.service": etl_service or "unknown",
|
"etl.service": etl_service or "unknown",
|
||||||
"content.type": content_type or "unknown",
|
"content.type": content_type or "unknown",
|
||||||
"status": status,
|
"status": status,
|
||||||
},
|
},
|
||||||
|
error_category,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -484,6 +549,26 @@ def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None:
|
||||||
_add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type})
|
_add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type})
|
||||||
|
|
||||||
|
|
||||||
|
def record_celery_queue_latency(
|
||||||
|
duration_s: float,
|
||||||
|
*,
|
||||||
|
task_name: str | None,
|
||||||
|
queue: str | None,
|
||||||
|
scheduled: bool,
|
||||||
|
operation: str | None,
|
||||||
|
) -> None:
|
||||||
|
_record(
|
||||||
|
_celery_queue_latency(),
|
||||||
|
duration_s,
|
||||||
|
{
|
||||||
|
"task.name": task_name or "unknown",
|
||||||
|
"task.queue": queue or "unknown",
|
||||||
|
"task.scheduled": bool(scheduled),
|
||||||
|
"operation": operation or "unknown",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]:
|
def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]:
|
||||||
from opentelemetry.metrics import Observation
|
from opentelemetry.metrics import Observation
|
||||||
|
|
||||||
|
|
@ -569,9 +654,12 @@ def register_runtime_observables() -> None:
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"categorize_exception",
|
||||||
|
"parse_celery_task_label",
|
||||||
"record_auth_failure",
|
"record_auth_failure",
|
||||||
"record_celery_heartbeat_failure",
|
"record_celery_heartbeat_failure",
|
||||||
"record_celery_heartbeat_refresh",
|
"record_celery_heartbeat_refresh",
|
||||||
|
"record_celery_queue_latency",
|
||||||
"record_chat_request_duration",
|
"record_chat_request_duration",
|
||||||
"record_chat_request_outcome",
|
"record_chat_request_outcome",
|
||||||
"record_compaction_run",
|
"record_compaction_run",
|
||||||
|
|
|
||||||
|
|
@ -92,6 +92,48 @@ def is_enabled() -> bool:
|
||||||
return _ENABLED
|
return _ENABLED
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_event_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
|
||||||
|
"""Coerce event attributes to OTel-safe scalar values."""
|
||||||
|
cleaned: dict[str, str | int | float | bool] = {}
|
||||||
|
for key, value in attrs.items():
|
||||||
|
if value is None:
|
||||||
|
continue
|
||||||
|
if isinstance(value, bool | int | float):
|
||||||
|
cleaned[key] = value
|
||||||
|
continue
|
||||||
|
text = str(value)
|
||||||
|
if text:
|
||||||
|
cleaned[key] = text
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def add_event(name: str, attributes: dict[str, Any] | None = None) -> None:
|
||||||
|
"""Attach an event to the current active span.
|
||||||
|
|
||||||
|
This is intentionally no-op and exception-safe when OTel is disabled,
|
||||||
|
unavailable, or no span is currently recording.
|
||||||
|
"""
|
||||||
|
if not _ENABLED or _ot_trace is None:
|
||||||
|
return
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
sp = _ot_trace.get_current_span()
|
||||||
|
if sp is None or not sp.is_recording():
|
||||||
|
return
|
||||||
|
sp.add_event(
|
||||||
|
name,
|
||||||
|
attributes=_clean_event_attrs(attributes) if attributes else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def record_error(span_obj: Any, exc: BaseException) -> None:
|
||||||
|
"""Record an exception and mark a span as errored without re-raising."""
|
||||||
|
if not _ENABLED:
|
||||||
|
return
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
span_obj.record_exception(exc)
|
||||||
|
span_obj.set_status(_OtStatus(_OtStatusCode.ERROR, str(exc)))
|
||||||
|
|
||||||
|
|
||||||
def _get_tracer():
|
def _get_tracer():
|
||||||
if not _OTEL_AVAILABLE:
|
if not _OTEL_AVAILABLE:
|
||||||
return None
|
return None
|
||||||
|
|
@ -452,6 +494,7 @@ def reload_for_tests() -> bool:
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"add_event",
|
||||||
"chat_request_span",
|
"chat_request_span",
|
||||||
"compaction_span",
|
"compaction_span",
|
||||||
"connector_sync_span",
|
"connector_sync_span",
|
||||||
|
|
@ -466,6 +509,7 @@ __all__ = [
|
||||||
"kb_search_span",
|
"kb_search_span",
|
||||||
"model_call_span",
|
"model_call_span",
|
||||||
"permission_asked_span",
|
"permission_asked_span",
|
||||||
|
"record_error",
|
||||||
"reload_for_tests",
|
"reload_for_tests",
|
||||||
"span",
|
"span",
|
||||||
"subagent_invoke_span",
|
"subagent_invoke_span",
|
||||||
|
|
|
||||||
101
surfsense_backend/tests/unit/observability/test_helpers.py
Normal file
101
surfsense_backend/tests/unit/observability/test_helpers.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
"""Tests for pure observability helper functions."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _disable_otel(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
|
||||||
|
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
||||||
|
ot.reload_for_tests()
|
||||||
|
yield
|
||||||
|
ot.reload_for_tests()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("task_name", "expected"),
|
||||||
|
[
|
||||||
|
("reindex_document", "reindex"),
|
||||||
|
("delete_document_background", "delete"),
|
||||||
|
("delete_folder_documents_background", "delete"),
|
||||||
|
("delete_search_space_background", "delete"),
|
||||||
|
("process_extension_document", "process"),
|
||||||
|
("process_youtube_video", "process"),
|
||||||
|
("process_file_upload", "process"),
|
||||||
|
("process_file_upload_with_document", "process"),
|
||||||
|
("process_circleback_meeting", "process"),
|
||||||
|
("generate_video_presentation", "generate"),
|
||||||
|
("generate_content_podcast", "generate"),
|
||||||
|
("cleanup_stale_indexing_notifications", "cleanup"),
|
||||||
|
("reconcile_pending_stripe_page_purchases", "reconcile"),
|
||||||
|
("reconcile_pending_stripe_token_purchases", "reconcile"),
|
||||||
|
("check_periodic_schedules", "check"),
|
||||||
|
("ai_sort_search_space", "ai"),
|
||||||
|
("index_notion_pages", "index"),
|
||||||
|
("index_github_repos", "index"),
|
||||||
|
("index_google_drive_files", "index"),
|
||||||
|
("index_composio_connector", "index"),
|
||||||
|
("index_obsidian_attachment", "index"),
|
||||||
|
("index_local_folder", "index"),
|
||||||
|
("index_uploaded_folder_files", "index"),
|
||||||
|
("noseparator", "noseparator"),
|
||||||
|
("", "unknown"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_parse_celery_task_label(task_name: str, expected: str) -> None:
|
||||||
|
assert ot_metrics.parse_celery_task_label(task_name) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_celery_task_label_handles_none() -> None:
|
||||||
|
assert ot_metrics.parse_celery_task_label(None) == "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("exc", "expected"),
|
||||||
|
[
|
||||||
|
(type("RateLimitError", (Exception,), {})(), "rate_limited"),
|
||||||
|
(type("AuthenticationError", (Exception,), {})(), "auth_failed"),
|
||||||
|
(type("QuotaInsufficientError", (Exception,), {})(), "quota_exhausted"),
|
||||||
|
(TimeoutError(), "timeout"),
|
||||||
|
(type("APIConnectionError", (Exception,), {})(), "network_failed"),
|
||||||
|
(type("ServiceUnavailableError", (Exception,), {})(), "server_error"),
|
||||||
|
(type("LockContentionError", (Exception,), {})(), "lock_contention"),
|
||||||
|
(type("UnsupportedFormatError", (Exception,), {})(), "unsupported_format"),
|
||||||
|
(type("ProviderError", (Exception,), {})(), "provider_error"),
|
||||||
|
(RuntimeError("plain"), "unknown"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_categorize_exception(exc: BaseException, expected: str) -> None:
|
||||||
|
assert ot_metrics.categorize_exception(exc) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_record_celery_queue_latency_noops_when_disabled() -> None:
|
||||||
|
ot_metrics.record_celery_queue_latency(
|
||||||
|
0.5,
|
||||||
|
task_name="index_notion_pages",
|
||||||
|
queue="surfsense.connectors",
|
||||||
|
scheduled=False,
|
||||||
|
operation="index",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_event_noops_when_disabled() -> None:
|
||||||
|
ot.add_event("test.event", {"value": 1})
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_event_noops_without_current_span(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
class FakeTrace:
|
||||||
|
@staticmethod
|
||||||
|
def get_current_span():
|
||||||
|
return None
|
||||||
|
|
||||||
|
monkeypatch.setattr(ot, "_ENABLED", True)
|
||||||
|
monkeypatch.setattr(ot, "_ot_trace", FakeTrace())
|
||||||
|
|
||||||
|
ot.add_event("test.event", {"value": 1})
|
||||||
Loading…
Add table
Add a link
Reference in a new issue