Merge pull request #1427 from AnishSarkar22/feat/opentelemetry

feat: OpenTelemetry integration
2026-05-25 19:15:18 +02:00 · 2026-05-23 15:50:20 -07:00 · 2026-05-23 15:50:20 -07:00 · da4ba09d88
commit da4ba09d88
parent fc3b39483b 98e3950dc8
45 changed files with 7554 additions and 4442 deletions
--- a/docker/.env.example
+++ b/docker/.env.example
@ -7,6 +7,9 @@
 # SurfSense version (use "latest" or a specific version like "0.0.14")
 SURFSENSE_VERSION=latest
 # Deployment environment: dev or production
 SURFSENSE_ENV=production
 # ------------------------------------------------------------------------------
 # Core Settings
 # ------------------------------------------------------------------------------
@ -304,6 +307,28 @@ STT_SERVICE=local/base
 # LANGSMITH_API_KEY=
 # LANGSMITH_PROJECT=surfsense
 # OpenTelemetry traces and metrics.
 # Enable the collector with: docker compose --profile observability up -d
 # SURFSENSE_ENABLE_OTEL=true
 # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc
 # OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
 #
 # Emergency kill switch.
 # OTEL_SDK_DISABLED=true
 #
 # Grafana Cloud OTLP credentials. These are used only by the collector container.
 # GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
 # GRAFANA_CLOUD_INSTANCE_ID=
 # GRAFANA_CLOUD_API_KEY=
 #
 # Optional host port overrides for the bundled OTel Collector. Only change
 # these if the host already uses 4317/4318/13133; backend containers still use
 # the internal Docker endpoint above.
 # OTEL_GRPC_PORT=4317
 # OTEL_HTTP_PORT=4318
 # OTEL_HEALTH_PORT=13133
 # ------------------------------------------------------------------------------
 # Advanced (optional)
 # ------------------------------------------------------------------------------
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@ -78,6 +78,15 @@ services:
      timeout: 5s
      retries: 5
  otel-lgtm:
    image: grafana/otel-lgtm:latest
    ports:
      - "${OTEL_GRPC_PORT:-4317}:4317"
      - "${OTEL_HTTP_PORT:-4318}:4318"
      - "${OTEL_GRAFANA_PORT:-3001}:3000"
      - "${OTEL_TEMPO_PORT:-3200}:3200"
    restart: unless-stopped
  searxng:
    image: searxng/searxng:2026.3.13-3c1f68c59
    ports:
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -61,6 +61,29 @@ services:
      timeout: 5s
      retries: 5
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.152.1
    profiles:
      - observability
    command: ["--config=/etc/otelcol/config.yaml"]
    volumes:
      - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
    environment:
      GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
      GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
      GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
    ports:
      - "${OTEL_GRPC_PORT:-4317}:4317"
      - "${OTEL_HTTP_PORT:-4318}:4318"
      - "${OTEL_HEALTH_PORT:-13133}:13133"
    mem_limit: 2g
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "/otelcol-contrib", "--version"]
      interval: 30s
      timeout: 5s
      retries: 3
  searxng:
    image: searxng/searxng:2026.3.13-3c1f68c59
    volumes:
--- a/docker/otel-collector/config.yaml
+++ b/docker/otel-collector/config.yaml
@ -0,0 +1,81 @@
 extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  basicauth/grafana_cloud:
    client_auth:
      username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
      password: ${env:GRAFANA_CLOUD_API_KEY}
 receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
 processors:
  # Percentage limits are calculated against the collector container memory limit.
  # Keep docker-compose.yml/Coolify memory limit set for predictability.
  memory_limiter:
    check_interval: 1s
    limit_percentage: 80
    spike_limit_percentage: 25
  attributes/scrub:
    actions:
      - key: http.request.header.authorization
        action: delete
      - key: http.request.header.cookie
        action: delete
      - key: db.statement
        action: delete
  tail_sampling:
    decision_wait: 10s
    num_traces: 50000
    expected_new_traces_per_sec: 100
    policies:
      - name: errors
        type: status_code
        status_code:
          status_codes: [ERROR]
      - name: slow-requests
        type: latency
        latency:
          threshold_ms: 500
      - name: baseline
        type: probabilistic
        probabilistic:
          sampling_percentage: 100
  batch:
    timeout: 5s
    send_batch_size: 1024
    send_batch_max_size: 2048
 exporters:
  otlp_http/grafana_cloud:
    endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
    auth:
      authenticator: basicauth/grafana_cloud
    sending_queue:
      enabled: true
      queue_size: 10000
    retry_on_failure:
      enabled: true
      initial_interval: 5s
      max_interval: 30s
      max_elapsed_time: 300s
 service:
  extensions: [health_check, basicauth/grafana_cloud]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, attributes/scrub, tail_sampling, batch]
      exporters: [otlp_http/grafana_cloud]
    metrics:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [otlp_http/grafana_cloud]
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@ -1,5 +1,8 @@
 DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
 # Deployment environment: dev or production
 SURFSENSE_ENV=dev
 #Celery Config
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/0
@ -303,8 +306,16 @@ LANGSMITH_PROJECT=surfsense
 # SURFSENSE_ENABLE_BUSY_MUTEX=false
 # SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false   # adds a per-turn LLM call
-# Observability — OTel (also requires OTEL_EXPORTER_OTLP_ENDPOINT)
+# Observability - OTel
 # SURFSENSE_ENABLE_OTEL=false
 # OpenTelemetry - endpoint enables export; absent = no-op.
 # Production should point at an OTel Collector. For local docker-compose.dev.yml,
 # use http://otel-lgtm:4317 instead.
 # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc                        # or http/protobuf
 # OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
 # OTEL_METRIC_EXPORT_INTERVAL=300000                       # ms; 5 minutes
 # OTEL_SDK_DISABLED=true                                  # emergency kill-switch
 # Skills + subagents
 # SURFSENSE_ENABLE_SKILLS=false
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py
@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool
 from langgraph.errors import GraphInterrupt
 from langgraph.types import Command, Interrupt
 from app.observability import metrics as ot_metrics, otel as ot
 from app.utils.perf import get_perf_logger
 from .config import (
@ -173,6 +174,9 @@ def build_task_tool_with_parent_config(
                    exc_info=True,
                )
        invoke_path = "resume" if pending_value is not None else "fresh"
        invoke_start = time.perf_counter()
        invoke_outcome = "ok"
        if pending_value is not None:
            resume_value = consume_surfsense_resume(runtime)
            if resume_value is None:
@ -188,18 +192,94 @@ def build_task_tool_with_parent_config(
            # Prevent the parent's resume payload from leaking into subagent
            # interrupts via langgraph's parent_scratchpad fallback.
            drain_parent_null_resume(runtime)
            with ot.subagent_invoke_span(
                subagent_type=subagent_type, path=invoke_path
            ) as sp:
                try:
                    result = subagent.invoke(
                        build_resume_command(resume_value, pending_id),
                        config=sub_config,
                    )
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                except GraphInterrupt as gi:
                    invoke_outcome = "interrupted"
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                    ot_metrics.record_subagent_invoke_duration(
                        (time.perf_counter() - invoke_start) * 1000,
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    ot_metrics.record_subagent_invoke_outcome(
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
                except Exception:
                    invoke_outcome = "error"
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                    ot_metrics.record_subagent_invoke_duration(
                        (time.perf_counter() - invoke_start) * 1000,
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    ot_metrics.record_subagent_invoke_outcome(
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    raise
        else:
            with ot.subagent_invoke_span(
                subagent_type=subagent_type, path=invoke_path
            ) as sp:
                try:
                    result = subagent.invoke(subagent_state, config=sub_config)
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                except GraphInterrupt as gi:
                    invoke_outcome = "interrupted"
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                    ot_metrics.record_subagent_invoke_duration(
                        (time.perf_counter() - invoke_start) * 1000,
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    ot_metrics.record_subagent_invoke_outcome(
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
                except Exception:
                    invoke_outcome = "error"
                    sp.set_attribute("subagent.outcome", invoke_outcome)
                    ot_metrics.record_subagent_invoke_duration(
                        (time.perf_counter() - invoke_start) * 1000,
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    ot_metrics.record_subagent_invoke_outcome(
                        subagent_type=subagent_type,
                        path=invoke_path,
                        outcome=invoke_outcome,
                    )
                    raise
        invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000
        ot_metrics.record_subagent_invoke_duration(
            invoke_elapsed_ms,
            subagent_type=subagent_type,
            path=invoke_path,
            outcome=invoke_outcome,
        )
        ot_metrics.record_subagent_invoke_outcome(
            subagent_type=subagent_type,
            path=invoke_path,
            outcome=invoke_outcome,
        )
        return _return_command_with_state_update(result, runtime.tool_call_id)
    async def atask(
@ -274,13 +354,29 @@ def build_task_tool_with_parent_config(
                # Prevent the parent's resume payload from leaking into subagent
                # interrupts via langgraph's parent_scratchpad fallback.
                drain_parent_null_resume(runtime)
                with ot.subagent_invoke_span(
                    subagent_type=subagent_type, path=invoke_path
                ) as sp:
                    try:
                        result = await subagent.ainvoke(
                            build_resume_command(resume_value, pending_id),
                            config=sub_config,
                        )
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                    except GraphInterrupt as gi:
                        ainvoke_outcome = "interrupted"
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                        ot_metrics.record_subagent_invoke_duration(
                            (time.perf_counter() - ainvoke_start) * 1000,
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        ot_metrics.record_subagent_invoke_outcome(
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        _perf_log.info(
                            "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
                            "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
@ -292,11 +388,44 @@ def build_task_tool_with_parent_config(
                            time.perf_counter() - atask_start,
                        )
                        _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
                    except Exception:
                        ainvoke_outcome = "error"
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                        ot_metrics.record_subagent_invoke_duration(
                            (time.perf_counter() - ainvoke_start) * 1000,
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        ot_metrics.record_subagent_invoke_outcome(
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        raise
            else:
                with ot.subagent_invoke_span(
                    subagent_type=subagent_type, path=invoke_path
                ) as sp:
                    try:
-                    result = await subagent.ainvoke(subagent_state, config=sub_config)
+                        result = await subagent.ainvoke(
                            subagent_state, config=sub_config
                        )
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                    except GraphInterrupt as gi:
                        ainvoke_outcome = "interrupted"
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                        ot_metrics.record_subagent_invoke_duration(
                            (time.perf_counter() - ainvoke_start) * 1000,
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        ot_metrics.record_subagent_invoke_outcome(
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        _perf_log.info(
                            "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
                            "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
@ -308,6 +437,21 @@ def build_task_tool_with_parent_config(
                            time.perf_counter() - atask_start,
                        )
                        _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
                    except Exception:
                        ainvoke_outcome = "error"
                        sp.set_attribute("subagent.outcome", ainvoke_outcome)
                        ot_metrics.record_subagent_invoke_duration(
                            (time.perf_counter() - ainvoke_start) * 1000,
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        ot_metrics.record_subagent_invoke_outcome(
                            subagent_type=subagent_type,
                            path=invoke_path,
                            outcome=ainvoke_outcome,
                        )
                        raise
            ainvoke_elapsed = time.perf_counter() - ainvoke_start
        except GraphInterrupt:
            raise
@ -326,6 +470,17 @@ def build_task_tool_with_parent_config(
            merge_elapsed,
            time.perf_counter() - atask_start,
        )
        ot_metrics.record_subagent_invoke_duration(
            ainvoke_elapsed * 1000,
            subagent_type=subagent_type,
            path=invoke_path,
            outcome=ainvoke_outcome,
        )
        ot_metrics.record_subagent_invoke_outcome(
            subagent_type=subagent_type,
            path=invoke_path,
            outcome=ainvoke_outcome,
        )
        return cmd
    return StructuredTool.from_function(
--- a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py
+++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py
@ -17,6 +17,7 @@ from langchain_core.tools import BaseTool
 from langgraph.types import interrupt
 from app.agents.new_chat.permissions import Rule
 from app.observability import metrics as ot_metrics
 from app.observability import otel as ot
 from .decision import normalize_permission_decision
@ -52,6 +53,8 @@ def request_permission_decision(
        ),
        ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE),
    ):
        ot_metrics.record_permission_ask(permission=tool_name)
        ot_metrics.record_interrupt(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE)
        decision = interrupt(payload)
    return normalize_permission_decision(decision)
--- a/surfsense_backend/app/agents/new_chat/middleware/compaction.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/compaction.py
@ -34,6 +34,7 @@ from deepagents.middleware.summarization import (
 )
 from langchain_core.messages import SystemMessage
 from app.observability import metrics as ot_metrics
 from app.observability import otel as ot
 if TYPE_CHECKING:
@ -178,6 +179,7 @@ class SurfSenseCompactionMiddleware(SummarizationMiddleware):
            messages_in=len(conversation_messages),
            extra={"compaction.cutoff_index": int(cutoff_index)},
        ):
            ot_metrics.record_compaction_run(reason="auto")
            messages_to_summarize, preserved_messages = super()._partition_messages(
                conversation_messages, cutoff_index
            )
--- a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py
@ -47,6 +47,7 @@ from langgraph.config import get_config
 from langgraph.runtime import Runtime
 from langgraph.types import interrupt
 from app.observability import metrics as ot_metrics
 from app.observability import otel as ot
 logger = logging.getLogger(__name__)
@ -195,6 +196,7 @@ class DoomLoopMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Respon
                "interrupt.tool": (action or {}).get("tool", "<unknown>"),
            },
        ):
            ot_metrics.record_interrupt(interrupt_type="permission_ask")
            decision = interrupt(
                {
                    "type": "permission_ask",
--- a/surfsense_backend/app/agents/new_chat/middleware/otel_span.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/otel_span.py
@ -16,13 +16,14 @@ dashboards expect.
 from __future__ import annotations
 import logging
 import time
 from collections.abc import Awaitable, Callable
 from typing import TYPE_CHECKING, Any
 from langchain.agents.middleware import AgentMiddleware
 from langchain_core.messages import AIMessage, ToolMessage
-from app.observability import otel as ot
+from app.observability import metrics as ot_metrics, otel as ot
 if TYPE_CHECKING:  # pragma: no cover — type-only
    from langchain.agents.middleware.types import (
@ -62,14 +63,37 @@ class OtelSpanMiddleware(AgentMiddleware):
            return await handler(request)
        model_id, provider = _resolve_model_attrs(request)
        t0 = time.perf_counter()
        with ot.model_call_span(model_id=model_id, provider=provider) as sp:
            _annotate_model_request(sp, model_id=model_id, provider=provider)
            try:
                result = await handler(request)
            except Exception:
                ot_metrics.record_model_call_duration(
                    (time.perf_counter() - t0) * 1000,
                    model=model_id,
                    provider=provider,
                )
                # span context manager records + re-raises
                raise
            else:
-                _annotate_model_response(sp, result)
+                input_tokens, output_tokens = _annotate_model_response(
                    sp,
                    result,
                    model_id=model_id,
                    provider=provider,
                )
                ot_metrics.record_model_call_duration(
                    (time.perf_counter() - t0) * 1000,
                    model=model_id,
                    provider=provider,
                )
                ot_metrics.record_model_token_usage(
                    input_tokens=input_tokens,
                    output_tokens=output_tokens,
                    model=model_id,
                    provider=provider,
                )
                return result
    # ------------------------------------------------------------------
@ -87,9 +111,24 @@ class OtelSpanMiddleware(AgentMiddleware):
        tool_name = _resolve_tool_name(request)
        input_size = _resolve_input_size(request)
        t0 = time.perf_counter()
        with ot.tool_call_span(tool_name, input_size=input_size) as sp:
            try:
                result = await handler(request)
-            _annotate_tool_result(sp, result)
+            except Exception:
                ot_metrics.record_tool_call_duration(
                    (time.perf_counter() - t0) * 1000,
                    tool_name=tool_name,
                )
                ot_metrics.record_tool_call_error(tool_name=tool_name)
                raise
            errored = _annotate_tool_result(sp, result)
            ot_metrics.record_tool_call_duration(
                (time.perf_counter() - t0) * 1000,
                tool_name=tool_name,
            )
            if errored:
                ot_metrics.record_tool_call_error(tool_name=tool_name)
            return result
@ -154,8 +193,29 @@ def _resolve_input_size(request: Any) -> int | None:
        return None
-def _annotate_model_response(span: Any, result: Any) -> None:
+def _annotate_model_request(
    span: Any, *, model_id: str | None, provider: str | None
 ) -> None:
    try:
        span.set_attribute("gen_ai.operation.name", "chat")
        if model_id:
            span.set_attribute("gen_ai.request.model", model_id)
        if provider:
            span.set_attribute("gen_ai.provider.name", provider)
    except Exception:  # pragma: no cover — defensive
        pass
 def _annotate_model_response(
    span: Any,
    result: Any,
    *,
    model_id: str | None = None,
    provider: str | None = None,
 ) -> tuple[int | None, int | None]:
    """Best-effort: attach prompt/completion token counts when available."""
    input_tokens: int | None = None
    output_tokens: int | None = None
    try:
        # ModelResponse may be a dataclass with .result containing AIMessage
        msg: Any
@ -165,22 +225,42 @@ def _annotate_model_response(span: Any, result: Any) -> None:
            inner = getattr(result, "result", None)
            msg = inner[-1] if isinstance(inner, list) and inner else inner
        if msg is None:
-            return
+            return None, None
        if provider:
            span.set_attribute("gen_ai.provider.name", provider)
        if model_id:
            span.set_attribute("gen_ai.request.model", model_id)
        response_model = getattr(msg, "response_metadata", {}) or {}
        if isinstance(response_model, dict):
            response_model = (
                response_model.get("model_name")
                or response_model.get("model")
                or response_model.get("model_id")
            )
        if not response_model:
            response_model = model_id
        if response_model:
            span.set_attribute("gen_ai.response.model", str(response_model))
        span.set_attribute("gen_ai.operation.name", "chat")
        usage = getattr(msg, "usage_metadata", None) or {}
        if isinstance(usage, dict):
            if (n := usage.get("input_tokens")) is not None:
-                span.set_attribute("tokens.prompt", int(n))
+                input_tokens = int(n)
                span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
            if (n := usage.get("output_tokens")) is not None:
-                span.set_attribute("tokens.completion", int(n))
+                output_tokens = int(n)
                span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
            if (n := usage.get("total_tokens")) is not None:
-                span.set_attribute("tokens.total", int(n))
+                span.set_attribute("gen_ai.usage.total_tokens", int(n))
        tool_calls = getattr(msg, "tool_calls", None) or []
        span.set_attribute("model.tool_calls", len(tool_calls))
    except Exception:  # pragma: no cover — defensive
        pass
    return input_tokens, output_tokens
-def _annotate_tool_result(span: Any, result: Any) -> None:
+def _annotate_tool_result(span: Any, result: Any) -> bool:
    errored = False
    try:
        if isinstance(result, ToolMessage):
            content = (
@ -192,11 +272,14 @@ def _annotate_tool_result(span: Any, result: Any) -> None:
            status = getattr(result, "status", None)
            if isinstance(status, str):
                span.set_attribute("tool.status", status)
                errored = status.lower() == "error"
            kwargs = getattr(result, "additional_kwargs", None) or {}
            if isinstance(kwargs, dict) and kwargs.get("error"):
                span.set_attribute("tool.error", True)
                errored = True
    except Exception:  # pragma: no cover — defensive
        pass
    return errored
 __all__ = ["OtelSpanMiddleware"]
--- a/surfsense_backend/app/agents/new_chat/middleware/permission.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/permission.py
@ -61,6 +61,7 @@ from app.agents.new_chat.permissions import (
    aggregate_action,
    evaluate_many,
 )
 from app.observability import metrics as ot_metrics
 from app.observability import otel as ot
 logger = logging.getLogger(__name__)
@ -284,6 +285,8 @@ class PermissionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
            ),
            ot.interrupt_span(interrupt_type="permission_ask"),
        ):
            ot_metrics.record_permission_ask(permission=tool_name)
            ot_metrics.record_interrupt(interrupt_type="permission_ask")
            decision = interrupt(payload)
        return _normalize_permission_decision(decision)
--- a/surfsense_backend/app/agents/new_chat/middleware/retry_after.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/retry_after.py
@ -45,6 +45,8 @@ from langchain.agents.middleware.types import (
 from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
 from langchain_core.messages import AIMessage
 from app.observability import metrics as ot_metrics, otel as ot
 logger = logging.getLogger(__name__)
 # Names of exception classes for which a retry would not help — context
@ -198,6 +200,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
                if not self._should_retry(exc) or attempt >= self.max_retries:
                    raise
                delay = self._delay_for_attempt(attempt, exc)
                ot.add_event(
                    "model.retry.scheduled",
                    {
                        "retry.attempt": attempt + 1,
                        "retry.max": self.max_retries,
                        "retry.delay_ms": int(delay * 1000),
                        "retry.reason": ot_metrics.categorize_exception(exc),
                    },
                )
                try:
                    dispatch_custom_event(
                        "surfsense.retrying",
@ -231,6 +242,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
                if not self._should_retry(exc) or attempt >= self.max_retries:
                    raise
                delay = self._delay_for_attempt(attempt, exc)
                ot.add_event(
                    "model.retry.scheduled",
                    {
                        "retry.attempt": attempt + 1,
                        "retry.max": self.max_retries,
                        "retry.delay_ms": int(delay * 1000),
                        "retry.reason": ot_metrics.categorize_exception(exc),
                    },
                )
                try:
                    await adispatch_custom_event(
                        "surfsense.retrying",
--- a/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py
@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any
 from langchain.agents.middleware import ModelFallbackMiddleware
 from app.observability import metrics as ot_metrics, otel as ot
 if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable
@ -55,7 +57,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
                raise
            last_exception = e
-        for fallback_model in self.models:
+        for attempt, fallback_model in enumerate(self.models, start=1):
            ot.add_event(
                "model.fallback",
                {
                    "fallback.attempt": attempt,
                    "fallback.from": attempt - 1,
                    "fallback.to": attempt,
                    "fallback.reason": ot_metrics.categorize_exception(last_exception),
                },
            )
            try:
                return handler(request.override(model=fallback_model))
            except Exception as e:
@ -79,7 +90,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
                raise
            last_exception = e
-        for fallback_model in self.models:
+        for attempt, fallback_model in enumerate(self.models, start=1):
            ot.add_event(
                "model.fallback",
                {
                    "fallback.attempt": attempt,
                    "fallback.from": attempt - 1,
                    "fallback.to": attempt,
                    "fallback.reason": ot_metrics.categorize_exception(last_exception),
                },
            )
            try:
                return await handler(request.override(model=fallback_model))
            except Exception as e:
--- a/surfsense_backend/app/app.py
+++ b/surfsense_backend/app/app.py
@ -1,4 +1,5 @@
 import asyncio
 import contextlib
 import gc
 import logging
 import time
@ -36,13 +37,15 @@ from app.config import (
 )
 from app.db import User, create_db_and_tables, get_async_session
 from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
 from app.observability import metrics as ot_metrics
 from app.observability.bootstrap import init_otel, shutdown_otel
 from app.rate_limiter import get_real_client_ip, limiter
 from app.routes import router as crud_router
 from app.routes.auth_routes import router as auth_router
 from app.schemas import UserCreate, UserRead, UserUpdate
 from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
 from app.users import SECRET, auth_backend, current_active_user, fastapi_users
-from app.utils.perf import get_perf_logger, log_system_snapshot
+from app.utils.perf import log_system_snapshot
 _error_logger = logging.getLogger("surfsense.errors")
@ -127,6 +130,8 @@ def _http_exception_handler(request: Request, exc: HTTPException) -> JSONRespons
      logged server-side.
    """
    rid = _get_request_id(request)
    if exc.status_code in {401, 403} and request.url.path.startswith("/auth"):
        ot_metrics.record_auth_failure(reason=_status_to_code(exc.status_code))
    should_sanitize = exc.status_code == 500
    # Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
@ -213,6 +218,7 @@ def _validation_error_handler(
 def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Catch-all: log full traceback, return sanitized 500."""
    rid = _get_request_id(request)
    ot_metrics.record_auth_failure(reason="unhandled_exception")
    _error_logger.error(
        "[%s] Unhandled exception on %s %s",
        rid,
@ -246,6 +252,7 @@ def _status_to_code(status_code: int, detail: str = "") -> str:
 def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    """Custom 429 handler that returns JSON matching our error envelope."""
    rid = _get_request_id(request)
    ot_metrics.record_rate_limit_rejection(scope="slowapi")
    retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
    return _build_error_response(
        429,
@ -306,6 +313,7 @@ def _check_rate_limit_memory(
                f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} "
                f"({len(timestamps)}/{max_requests} in {window_seconds}s)"
            )
            ot_metrics.record_rate_limit_rejection(scope=scope)
            raise HTTPException(
                status_code=429,
                detail="RATE_LIMIT_EXCEEDED",
@ -349,6 +357,7 @@ def _check_rate_limit(
            f"Rate limit exceeded on {scope} for IP {client_ip} "
            f"({current_count}/{max_requests} in {window_seconds}s)"
        )
        ot_metrics.record_rate_limit_rejection(scope=scope)
        raise HTTPException(
            status_code=429,
            detail="RATE_LIMIT_EXCEEDED",
@ -558,6 +567,7 @@ async def lifespan(app: FastAPI):
    gc.set_threshold(700, 10, 5)
    _enable_slow_callback_logging(threshold_sec=0.5)
    init_otel(app)
    await create_db_and_tables()
    await setup_checkpointer_tables()
    initialize_openrouter_integration()
@ -592,6 +602,7 @@ async def lifespan(app: FastAPI):
    _stop_openrouter_background_refresh()
    await close_checkpointer()
    shutdown_otel()
 def registration_allowed():
@ -676,32 +687,20 @@ class RequestPerfMiddleware(BaseHTTPMiddleware):
    async def dispatch(
        self, request: StarletteRequest, call_next: RequestResponseEndpoint
    ) -> StarletteResponse:
        perf = get_perf_logger()
        t0 = time.perf_counter()
        response = await call_next(request)
        elapsed_ms = (time.perf_counter() - t0) * 1000
        path = request.url.path
        method = request.method
        status = response.status_code
        perf.debug(
            "[request] %s %s -> %d in %.1fms",
            method,
            path,
            status,
            elapsed_ms,
        )
        if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD:
-            perf.warning(
+            with contextlib.suppress(Exception):
-                "[SLOW_REQUEST] %s %s -> %d in %.1fms (threshold=%.0fms)",
+                from opentelemetry import trace
-                method,
+
-                path,
+                span = trace.get_current_span()
-                status,
+                span.set_attribute("slow_request", True)
-                elapsed_ms,
+                span.set_attribute("surfsense.request.elapsed_ms", elapsed_ms)
-                _PERF_SLOW_REQUEST_THRESHOLD,
+                span.set_attribute("http.route", path)
            )
            log_system_snapshot("slow_request")
        return response
--- a/surfsense_backend/app/celery_app.py
+++ b/surfsense_backend/app/celery_app.py
@ -1,16 +1,103 @@
 """Celery application configuration and setup."""
 import contextlib
 import os
 import time
 from celery import Celery
 from celery.schedules import crontab
-from celery.signals import worker_process_init
+from celery.signals import (
    before_task_publish,
    task_postrun,
    task_prerun,
    worker_process_init,
 )
 from dotenv import load_dotenv
 try:
    from opentelemetry import trace
 except ImportError:  # pragma: no cover - optional OTel dependency
    trace = None  # type: ignore[assignment]
 # Load environment variables
 load_dotenv()
@before_task_publish.connect
 def _stamp_enqueue_time(headers=None, **_kwargs):
    """Stamp enqueue time so workers can measure queue wait."""
    if headers is None:
        return
    with contextlib.suppress(Exception):
        headers["surfsense.enqueued_at_ns"] = str(time.monotonic_ns())
@task_prerun.connect
 def _record_queue_latency(task=None, **_kwargs):
    """Record queue latency and stash task metadata for span enrichment."""
    if task is None:
        return
    try:
        from app.observability import metrics as ot_metrics
        task_name = getattr(task, "name", None) or "unknown"
        operation = ot_metrics.parse_celery_task_label(task_name)
        request = getattr(task, "request", None)
        delivery_info = getattr(request, "delivery_info", None) or {}
        queue = delivery_info.get("routing_key") or "unknown"
        scheduled = bool(
            getattr(request, "eta", None) or getattr(request, "expires", None)
        )
        with contextlib.suppress(Exception):
            request.surfsense_operation = operation
            request.surfsense_queue = queue
            request.surfsense_scheduled = scheduled
        headers = getattr(request, "headers", None) or {}
        enqueued_ns = headers.get("surfsense.enqueued_at_ns")
        if enqueued_ns is None:
            return
        elapsed_s = (time.monotonic_ns() - int(enqueued_ns)) / 1e9
        with contextlib.suppress(Exception):
            request.surfsense_queue_latency_ms = elapsed_s * 1000
        ot_metrics.record_celery_queue_latency(
            elapsed_s,
            task_name=task_name,
            queue=queue,
            scheduled=scheduled,
            operation=operation,
        )
    except Exception:
        pass
@task_postrun.connect
 def _set_celery_span_attributes(task=None, **_kwargs):
    """Attach derived queue metadata to the active Celery run span."""
    if task is None or trace is None:
        return
    try:
        request = getattr(task, "request", None)
        if request is None:
            return
        span = trace.get_current_span()
        operation = getattr(request, "surfsense_operation", None)
        if operation:
            span.set_attribute("celery.task.operation", operation)
        latency_ms = getattr(request, "surfsense_queue_latency_ms", None)
        if latency_ms is not None:
            span.set_attribute("celery.queue.latency_ms", latency_ms)
    except Exception:
        pass
@worker_process_init.connect
 def init_worker(**kwargs):
    """Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
@ -18,6 +105,10 @@ def init_worker(**kwargs):
    This ensures the Auto mode (LiteLLM Router) is available for background tasks
    like document summarization and image generation.
    """
    from app.observability.bootstrap import init_otel
    init_otel(app=None, traces=True, metrics=True, logs=True)
    from app.config import (
        initialize_image_gen_router,
        initialize_llm_router,
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@ -1,4 +1,7 @@
 import contextlib
 import logging
 import time
 from pathlib import PurePosixPath
 from app.config import config as app_config
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
@ -10,6 +13,11 @@ from app.etl_pipeline.file_classifier import FileCategory, classify_file
 from app.etl_pipeline.parsers.audio import transcribe_audio
 from app.etl_pipeline.parsers.direct_convert import convert_file_directly
 from app.etl_pipeline.parsers.plaintext import read_plaintext
 from app.observability import metrics as ot_metrics, otel as ot
 def _file_extension(filename: str) -> str:
    return PurePosixPath(filename).suffix.lower() or "none"
 class EtlPipelineService:
@ -20,7 +28,16 @@ class EtlPipelineService:
    async def extract(self, request: EtlRequest) -> EtlResult:
        category = classify_file(request.filename)
-
+        start = time.perf_counter()
        status = "success"
        error_category: str | None = None
        result: EtlResult | None = None
        with ot.etl_extract_span(
            content_type=category.value,
            file_extension=_file_extension(request.filename),
            processing_mode=request.processing_mode.value,
        ) as sp:
            try:
                if category == FileCategory.UNSUPPORTED:
                    raise EtlUnsupportedFileError(
                        f"File type not supported for parsing: {request.filename}"
@ -28,41 +45,74 @@ class EtlPipelineService:
                if category == FileCategory.PLAINTEXT:
                    content = read_plaintext(request.file_path)
-            return EtlResult(
+                    result = EtlResult(
                        markdown_content=content,
                        etl_service="PLAINTEXT",
                        content_type="plaintext",
                    )
                    return result
                if category == FileCategory.DIRECT_CONVERT:
                    content = convert_file_directly(request.file_path, request.filename)
-            return EtlResult(
+                    result = EtlResult(
                        markdown_content=content,
                        etl_service="DIRECT_CONVERT",
                        content_type="direct_convert",
                    )
                    return result
                if category == FileCategory.AUDIO:
                    content = await transcribe_audio(request.file_path, request.filename)
-            return EtlResult(
+                    result = EtlResult(
                        markdown_content=content,
                        etl_service="AUDIO",
                        content_type="audio",
                    )
                    return result
                if category == FileCategory.IMAGE:
-            return await self._extract_image(request)
+                    result = await self._extract_image(request)
                    return result
-        return await self._extract_document(request)
+                result = await self._extract_document(request)
                return result
            except Exception as exc:
                status = "error"
                error_category = ot_metrics.categorize_exception(exc)
                raise
            finally:
                with contextlib.suppress(Exception):
                    if result is not None:
                        sp.set_attribute("etl.service", result.etl_service)
                        sp.set_attribute("content.type", result.content_type)
                    sp.set_attribute("etl.status", status)
                    ot_metrics.record_etl_extract_duration(
                        time.perf_counter() - start,
                        etl_service=result.etl_service if result else None,
                        content_type=result.content_type if result else category.value,
                        status=status,
                    )
                    ot_metrics.record_etl_extract_outcome(
                        etl_service=result.etl_service if result else None,
                        content_type=result.content_type if result else category.value,
                        status=status,
                        error_category=error_category,
                    )
    async def _extract_image(self, request: EtlRequest) -> EtlResult:
        if self._vision_llm:
            try:
                from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
                with ot.etl_parse_span(
                    etl_service="VISION_LLM",
                    content_type="image",
                    file_extension=_file_extension(request.filename),
                ) as sp:
                    content = await parse_with_vision_llm(
                        request.file_path, request.filename, self._vision_llm
                    )
                    sp.set_attribute("etl.status", "success")
                return EtlResult(
                    markdown_content=content,
                    etl_service="VISION_LLM",
@ -87,13 +137,33 @@ class EtlPipelineService:
                        request.filename,
                        exc_info=True,
                    )
                ot.add_event(
                    "etl.fallback",
                    {
                        "fallback.from": "vision_llm",
                        "fallback.to": "document_parser",
                        "fallback.reason": ot_metrics.categorize_exception(exc),
                    },
                )
        else:
            logging.info(
                "No vision LLM provided, falling back to document parser for %s",
                request.filename,
            )
            ot.add_event(
                "etl.fallback",
                {
                    "fallback.from": "vision_llm",
                    "fallback.to": "document_parser",
                    "fallback.reason": "not_configured",
                },
            )
        try:
            with ot.etl_ocr_span(
                etl_service=app_config.ETL_SERVICE,
                file_extension=_file_extension(request.filename),
            ):
                return await self._extract_document(request)
        except (EtlUnsupportedFileError, EtlServiceUnavailableError):
            raise EtlUnsupportedFileError(
@ -121,18 +191,27 @@ class EtlPipelineService:
                f"File type {ext} is not supported by {etl_service}"
            )
        with ot.etl_parse_span(
            etl_service=etl_service,
            content_type="document",
            file_extension=ext,
            processing_mode=request.processing_mode.value,
        ) as sp:
            if etl_service == "DOCLING":
                from app.etl_pipeline.parsers.docling import parse_with_docling
                content = await parse_with_docling(request.file_path, request.filename)
            elif etl_service == "UNSTRUCTURED":
-            from app.etl_pipeline.parsers.unstructured import parse_with_unstructured
+                from app.etl_pipeline.parsers.unstructured import (
                    parse_with_unstructured,
                )
                content = await parse_with_unstructured(request.file_path)
            elif etl_service == "LLAMACLOUD":
                content = await self._extract_with_llamacloud(request)
            else:
                raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
            sp.set_attribute("etl.status", "success")
        # When the operator opts into vision-LLM at ingest, walk the
        # original file's embedded images and append a structured
@ -171,9 +250,14 @@ class EtlPipelineService:
        async def _ocr_image(image_path: str, image_name: str) -> str:
            try:
                sub = EtlPipelineService(vision_llm=None)
                with ot.etl_picture_ocr_span(
                    file_extension=_file_extension(image_name)
                ) as sp:
                    ocr_result = await sub.extract(
                        EtlRequest(file_path=image_path, filename=image_name)
                    )
                    sp.set_attribute("etl.service", ocr_result.etl_service)
                    sp.set_attribute("etl.status", "success")
            except (
                EtlUnsupportedFileError,
                EtlServiceUnavailableError,
@ -181,20 +265,42 @@ class EtlPipelineService:
                # Common case: the configured ETL service can't OCR
                # this image format (or no service is configured at
                # all). Don't spam warnings -- just no OCR for it.
                ot.add_event(
                    "etl.ocr.skipped",
                    {
                        "skip.reason": "unsupported_format",
                        "error.category": ot_metrics.categorize_exception(exc),
                    },
                )
                logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
                return ""
            return ocr_result.markdown_content
        try:
            with ot.etl_picture_describe_span() as sp:
                result = await describe_pictures(
                    request.file_path,
                    request.filename,
                    self._vision_llm,
                    ocr_runner=_ocr_image,
                )
-        except Exception:
+                sp.set_attribute("image.described.count", len(result.descriptions))
                sp.set_attribute("image.failed.count", result.failed)
                sp.set_attribute("image.skipped.too_small", result.skipped_too_small)
                sp.set_attribute("image.skipped.too_large", result.skipped_too_large)
                sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate)
                sp.set_attribute("etl.status", "success")
        except Exception as exc:
            # Picture description is additive; never let it fail an
            # otherwise-successful document extraction.
            ot.add_event(
                "etl.degraded",
                {
                    "degraded.reason": "picture_describe_failed",
                    "degraded.action": "return_parser_output",
                    "error.category": ot_metrics.categorize_exception(exc),
                },
            )
            logging.warning(
                "Picture description failed for %s, returning parser output unchanged",
                request.filename,
@ -247,7 +353,15 @@ class EtlPipelineService:
                return await parse_with_azure_doc_intelligence(
                    request.file_path, processing_mode=mode_value
                )
-            except Exception:
+            except Exception as exc:
                ot.add_event(
                    "etl.fallback",
                    {
                        "fallback.from": "azure_di",
                        "fallback.to": "llamacloud",
                        "fallback.reason": ot_metrics.categorize_exception(exc),
                    },
                )
                logging.warning(
                    "Azure Document Intelligence failed for %s, "
                    "falling back to LlamaCloud",
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@ -2,6 +2,7 @@ import asyncio
 import contextlib
 import hashlib
 import logging
 import sys
 import time
 from collections.abc import Awaitable, Callable
 from dataclasses import dataclass, field
@ -57,6 +58,7 @@ from app.indexing_pipeline.pipeline_logger import (
    log_retryable_llm_error,
    log_unexpected_error,
 )
 from app.observability import metrics as ot_metrics, otel as ot
 from app.utils.perf import get_perf_logger
@ -362,6 +364,16 @@ class IndexingPipelineService:
        )
        perf = get_perf_logger()
        t_index = time.perf_counter()
        document_type = (
            document.document_type.value
            if getattr(document, "document_type", None)
            else None
        )
        persist_span_cm = ot.kb_persist_span(
            document_type=document_type,
        )
        persist_span = persist_span_cm.__enter__()
        outcome_status = "failed"
        try:
            log_index_started(ctx)
            document.status = DocumentStatus.processing()
@ -429,34 +441,41 @@ class IndexingPipelineService:
                time.perf_counter() - t_index,
            )
            log_index_success(ctx, chunk_count=len(chunks))
            outcome_status = "success"
            await self._enqueue_ai_sort_if_enabled(document)
        except RETRYABLE_LLM_ERRORS as e:
            ot.record_error(persist_span, e)
            log_retryable_llm_error(ctx, e)
            outcome_status = "requeued"
            await rollback_and_persist_failure(
                self.session, document, llm_retryable_message(e)
            )
        except PERMANENT_LLM_ERRORS as e:
            ot.record_error(persist_span, e)
            log_permanent_llm_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, llm_permanent_message(e)
            )
        except RecursionError as e:
            ot.record_error(persist_span, e)
            log_chunking_overflow(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, PipelineMessages.CHUNKING_OVERFLOW
            )
        except EMBEDDING_ERRORS as e:
            ot.record_error(persist_span, e)
            log_embedding_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, embedding_message(e)
            )
        except Exception as e:
            ot.record_error(persist_span, e)
            log_unexpected_error(ctx, e)
            await rollback_and_persist_failure(
                self.session, document, safe_exception_message(e)
@ -465,6 +484,17 @@ class IndexingPipelineService:
        with contextlib.suppress(Exception):
            await self.session.refresh(document)
        with contextlib.suppress(Exception):
            persist_span.set_attribute("indexing.status", outcome_status)
        ot_metrics.record_indexing_document_duration(
            time.perf_counter() - t_index,
            document_type=document_type,
        )
        ot_metrics.record_indexing_document_outcome(
            document_type=document_type,
            status=outcome_status,
        )
        persist_span_cm.__exit__(*sys.exc_info())
        return document
    async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
--- a/surfsense_backend/app/observability/init.py
+++ b/surfsense_backend/app/observability/init.py
@ -5,3 +5,5 @@ small wrapper around the optional ``opentelemetry`` instrumentation. The
 wrapper is a no-op when OTEL is not configured, so importing it from
 performance-critical paths is safe.
 """
 __all__ = ["bootstrap", "metrics", "otel"]
--- a/surfsense_backend/app/observability/bootstrap.py
+++ b/surfsense_backend/app/observability/bootstrap.py
@ -0,0 +1,390 @@
 """Programmatic OpenTelemetry bootstrap for SurfSense backend processes."""
 from __future__ import annotations
 import contextlib
 import logging
 import os
 import socket
 from importlib import metadata
 from typing import Any
 from urllib.parse import urlsplit, urlunsplit
 from app.observability import otel
 logger = logging.getLogger(__name__)
 _BOOL_TRUE = {"1", "true", "yes", "on"}
 _TRACES_INITIALIZED = False
 _METRICS_INITIALIZED = False
 _LOGS_INITIALIZED = False
 _FASTAPI_INSTRUMENTED = False
 _SQLALCHEMY_INSTRUMENTED = False
 _PSYCOPG_INSTRUMENTED = False
 _REDIS_INSTRUMENTED = False
 _HTTPX_INSTRUMENTED = False
 _CELERY_INSTRUMENTED = False
 _TRACER_PROVIDER: Any | None = None
 _METER_PROVIDER: Any | None = None
 def _env_truthy(name: str) -> bool:
    return os.environ.get(name, "").strip().lower() in _BOOL_TRUE
 def is_otel_disabled() -> bool:
    """Return true when either SurfSense or OTel's spec kill switch is set."""
    return _env_truthy("SURFSENSE_DISABLE_OTEL") or _env_truthy("OTEL_SDK_DISABLED")
 def is_otel_configured() -> bool:
    """Return true when this process should export OTel signals."""
    return bool(
        os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
        or os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
        or os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
    )
 def _package_version() -> str:
    with contextlib.suppress(metadata.PackageNotFoundError):
        return metadata.version("surf-new-backend")
    return "unknown"
 def _deployment_environment() -> str:
    return os.environ.get("SURFSENSE_ENV", "dev")
 def _build_resource():
    from opentelemetry.sdk.resources import Resource
    deployment_environment = _deployment_environment()
    return Resource.create(
        {
            "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
            "service.version": _package_version(),
            "service.instance.id": socket.gethostname(),
            "deployment.environment.name": deployment_environment,
            # Compatibility alias for Grafana onboarding checks that still use
            # the older semantic-convention key.
            "deployment.environment": deployment_environment,
        }
    )
 def _otlp_protocol() -> str:
    return os.environ.get("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc").strip().lower()
 def _trace_exporter():
    if _otlp_protocol() == "http/protobuf":
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter,
        )
        endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
        return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter()
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter()
 def _metric_exporter():
    if _otlp_protocol() == "http/protobuf":
        from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
            OTLPMetricExporter,
        )
        endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
        return (
            OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter()
        )
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
        OTLPMetricExporter,
    )
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
    return OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter()
 def _safe_instrument(name: str, instrument: Any) -> bool:
    try:
        instrument()
    except Exception:
        logger.warning("OpenTelemetry %s instrumentation failed", name, exc_info=True)
        return False
    return True
 def _url_without_query(raw_url: Any) -> str | None:
    try:
        parts = urlsplit(str(raw_url))
    except Exception:
        return None
    if not parts.scheme or not parts.netloc:
        return None
    return urlunsplit((parts.scheme, parts.netloc, parts.path or "/", "", ""))
 def _sanitize_http_span_url(span: Any, request: Any) -> None:
    sanitized = _url_without_query(getattr(request, "url", None))
    if not sanitized:
        return
    with contextlib.suppress(Exception):
        # Keep both old and current semantic-convention names safe. The
        # collector can drop one later without needing application changes.
        span.set_attribute("http.url", sanitized)
        span.set_attribute("url.full", sanitized)
 def _instrument_fastapi(app: Any | None) -> None:
    global _FASTAPI_INSTRUMENTED
    if app is None or _FASTAPI_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
        FastAPIInstrumentor.instrument_app(
            app,
            excluded_urls="/health,/ready,/metrics",
        )
    if _safe_instrument("FastAPI", _run):
        _FASTAPI_INSTRUMENTED = True
 def instrument_sqlalchemy_engine(engine: Any) -> None:
    """Instrument a SQLAlchemy engine once per process."""
    global _SQLALCHEMY_INSTRUMENTED
    if _SQLALCHEMY_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
        SQLAlchemyInstrumentor().instrument(
            engine=getattr(engine, "sync_engine", engine),
            enable_commenter=True,
        )
    if _safe_instrument("SQLAlchemy", _run):
        _SQLALCHEMY_INSTRUMENTED = True
 def _instrument_sqlalchemy() -> None:
    if _SQLALCHEMY_INSTRUMENTED:
        return
    with contextlib.suppress(Exception):
        from app.db import engine
        instrument_sqlalchemy_engine(engine)
 def _instrument_psycopg() -> None:
    global _PSYCOPG_INSTRUMENTED
    if _PSYCOPG_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.psycopg import PsycopgInstrumentor
        PsycopgInstrumentor().instrument()
    if _safe_instrument("psycopg", _run):
        _PSYCOPG_INSTRUMENTED = True
 def _instrument_redis() -> None:
    global _REDIS_INSTRUMENTED
    if _REDIS_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.redis import RedisInstrumentor
        RedisInstrumentor().instrument()
    if _safe_instrument("Redis", _run):
        _REDIS_INSTRUMENTED = True
 def _instrument_httpx() -> None:
    global _HTTPX_INSTRUMENTED
    if _HTTPX_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
        HTTPXClientInstrumentor().instrument(
            request_hook=lambda span, request: _sanitize_http_span_url(span, request),
            response_hook=lambda span, request, _response: _sanitize_http_span_url(
                span, request
            ),
        )
    if _safe_instrument("HTTPX", _run):
        _HTTPX_INSTRUMENTED = True
 def instrument_celery() -> None:
    """Instrument Celery producer/consumer hooks once per process."""
    global _CELERY_INSTRUMENTED
    if _CELERY_INSTRUMENTED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.celery import CeleryInstrumentor
        CeleryInstrumentor().instrument()
    if _safe_instrument("Celery", _run):
        _CELERY_INSTRUMENTED = True
 def _instrument_libraries(app: Any | None) -> None:
    _instrument_fastapi(app)
    _instrument_sqlalchemy()
    _instrument_psycopg()
    _instrument_redis()
    _instrument_httpx()
    instrument_celery()
 def init_traces(app: Any | None = None) -> None:
    """Install the tracer provider, span processor, exporter, and instrumentors."""
    global _TRACER_PROVIDER, _TRACES_INITIALIZED
    if _TRACES_INITIALIZED:
        _instrument_fastapi(app)
        return
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased
    provider = TracerProvider(
        resource=_build_resource(),
        sampler=ParentBased(ALWAYS_ON),
    )
    provider.add_span_processor(BatchSpanProcessor(_trace_exporter()))
    try:
        trace.set_tracer_provider(provider)
    except Exception:
        logger.warning(
            "OpenTelemetry tracer provider was already set; reusing existing provider",
            exc_info=True,
        )
        _TRACER_PROVIDER = trace.get_tracer_provider()
    else:
        _TRACER_PROVIDER = provider
    _TRACES_INITIALIZED = True
    otel.reload_for_tests()
    _instrument_libraries(app)
 def init_metrics() -> None:
    """Install the meter provider, metric reader, exporter, and custom gauges."""
    global _METER_PROVIDER, _METRICS_INITIALIZED
    if _METRICS_INITIALIZED:
        return
    from opentelemetry import metrics
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
    interval_ms = int(os.environ.get("OTEL_METRIC_EXPORT_INTERVAL", "60000"))
    reader = PeriodicExportingMetricReader(
        _metric_exporter(),
        export_interval_millis=interval_ms,
    )
    provider = MeterProvider(metric_readers=[reader], resource=_build_resource())
    try:
        metrics.set_meter_provider(provider)
    except Exception:
        logger.warning(
            "OpenTelemetry meter provider was already set; reusing existing provider",
            exc_info=True,
        )
        _METER_PROVIDER = metrics.get_meter_provider()
    else:
        _METER_PROVIDER = provider
    _METRICS_INITIALIZED = True
    from app.observability.metrics import register_runtime_observables
    register_runtime_observables()
 def init_logs() -> None:
    """Enable trace/span correlation fields on stdlib LogRecords."""
    global _LOGS_INITIALIZED
    if _LOGS_INITIALIZED:
        return
    def _run() -> None:
        from opentelemetry.instrumentation.logging import LoggingInstrumentor
        # Required for stdlib LogRecords to receive otelTraceID/otelSpanID.
        # logging.basicConfig is already installed by main.py, so this does not
        # take over formatting in normal app startup.
        LoggingInstrumentor().instrument(set_logging_format=True)
    if _safe_instrument("logging", _run):
        _LOGS_INITIALIZED = True
 def init_otel(
    app: Any | None = None,
    *,
    traces: bool = True,
    metrics: bool = True,
    logs: bool = True,
 ) -> None:
    """Initialize OpenTelemetry for a FastAPI or Celery process."""
    if is_otel_disabled() or not is_otel_configured():
        otel.reload_for_tests()
        return
    if traces:
        init_traces(app)
    if metrics:
        init_metrics()
    if logs:
        init_logs()
 def shutdown_otel(timeout_millis: int = 5000) -> None:
    """Best-effort flush and shutdown for installed providers."""
    for provider in (_TRACER_PROVIDER, _METER_PROVIDER):
        if provider is None:
            continue
        with contextlib.suppress(Exception):
            provider.force_flush(timeout_millis=timeout_millis)
        with contextlib.suppress(Exception):
            provider.shutdown()
 __all__ = [
    "_BOOL_TRUE",
    "_build_resource",
    "init_logs",
    "init_metrics",
    "init_otel",
    "init_traces",
    "instrument_celery",
    "instrument_sqlalchemy_engine",
    "is_otel_configured",
    "is_otel_disabled",
    "shutdown_otel",
 ]
--- a/surfsense_backend/app/observability/metrics.py
+++ b/surfsense_backend/app/observability/metrics.py
@ -0,0 +1,684 @@
 """Custom OpenTelemetry metrics for SurfSense.
 This module owns all SurfSense-specific metric instruments. Callers use the
 small helper functions below instead of constructing instruments directly so
 attribute names and cardinality stay consistent across the backend.
 """
 from __future__ import annotations
 import contextlib
 import gc
 import logging
 from functools import lru_cache
 from importlib import metadata
 from typing import Any
 from app.observability import otel
 logger = logging.getLogger(__name__)
 _INSTRUMENTATION_NAME = "surfsense.platform"
 _OBSERVABLES_REGISTERED = False
 _ERROR_CATEGORY_UNKNOWN = "unknown"
 _ERROR_CATEGORY_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("rate_limited", ("ratelimit", "rate_limit", "toomanyrequests", "429")),
    ("auth_failed", ("authentication", "auth", "unauthorized", "forbidden")),
    ("quota_exhausted", ("quota", "insufficient", "credit", "billing")),
    ("timeout", ("timeout", "timedout", "deadline")),
    ("network_failed", ("connection", "connect", "network", "dns", "socket")),
    ("server_error", ("internalserver", "serviceunavailable", "badgateway", "gateway")),
    ("lock_contention", ("lock", "busy", "contention", "alreadyrunning")),
    ("unsupported_format", ("unsupported", "format", "filetype")),
    ("provider_error", ("provider", "apierror", "apistatus", "badrequest")),
 )
 def _package_version() -> str:
    with contextlib.suppress(metadata.PackageNotFoundError):
        return metadata.version("surf-new-backend")
    return "unknown"
 def _is_enabled() -> bool:
    return otel.is_enabled()
 def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
    """Drop empty values and coerce low-cardinality attrs to OTel-safe scalars."""
    cleaned: dict[str, str | int | float | bool] = {}
    for key, value in attrs.items():
        if value is None:
            continue
        if isinstance(value, bool | int | float):
            cleaned[key] = value
            continue
        text = str(value)
        if text:
            cleaned[key] = text
    return cleaned
 def _attrs_with_optional_error_category(
    attrs: dict[str, Any], error_category: str | None
 ) -> dict[str, Any]:
    if error_category:
        return {**attrs, "error.category": error_category}
    return attrs
 def categorize_exception(exc: BaseException | None) -> str:
    """Return a low-cardinality category for an exception."""
    if exc is None:
        return _ERROR_CATEGORY_UNKNOWN
    haystack = " ".join(
        cls.__name__.replace("-", "").replace("_", "").lower()
        for cls in type(exc).__mro__
    )
    for category, hints in _ERROR_CATEGORY_HINTS:
        if any(hint in haystack for hint in hints):
            return category
    return _ERROR_CATEGORY_UNKNOWN
 def parse_celery_task_label(task_name: str | None) -> str:
    """Return the operation token from a Celery task name."""
    if not task_name:
        return "unknown"
    operation = str(task_name).split("_", 1)[0].strip()
    return operation or "unknown"
 def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None:
    if not _is_enabled():
        return
    with contextlib.suppress(Exception):
        callable_obj.record(value, _clean_attrs(attrs))
 def _add(callable_obj: Any, value: int, attrs: dict[str, Any]) -> None:
    if not _is_enabled():
        return
    with contextlib.suppress(Exception):
        callable_obj.add(value, _clean_attrs(attrs))
@lru_cache(maxsize=1)
 def _get_meter():
    from opentelemetry import metrics
    return metrics.get_meter(_INSTRUMENTATION_NAME, _package_version())
@lru_cache(maxsize=1)
 def _model_call_duration():
    return _get_meter().create_histogram(
        "surfsense.model.call.duration",
        unit="ms",
        description="Duration of SurfSense LLM model calls.",
    )
@lru_cache(maxsize=1)
 def _model_token_usage():
    return _get_meter().create_histogram(
        "gen_ai.client.token.usage",
        unit="{token}",
        description="Token usage reported by GenAI model responses.",
    )
@lru_cache(maxsize=1)
 def _tool_call_duration():
    return _get_meter().create_histogram(
        "surfsense.tool.call.duration",
        unit="ms",
        description="Duration of SurfSense agent tool calls.",
    )
@lru_cache(maxsize=1)
 def _tool_call_errors():
    return _get_meter().create_counter(
        "surfsense.tool.call.errors",
        description="Count of SurfSense agent tool call errors.",
    )
@lru_cache(maxsize=1)
 def _kb_search_duration():
    return _get_meter().create_histogram(
        "surfsense.kb.search.duration",
        unit="ms",
        description="Duration of SurfSense knowledge-base search calls.",
    )
@lru_cache(maxsize=1)
 def _compaction_runs():
    return _get_meter().create_counter(
        "surfsense.compaction.runs",
        description="Count of SurfSense conversation compaction runs.",
    )
@lru_cache(maxsize=1)
 def _permission_asks():
    return _get_meter().create_counter(
        "surfsense.permission.asks",
        description="Count of SurfSense permission asks.",
    )
@lru_cache(maxsize=1)
 def _interrupts():
    return _get_meter().create_counter(
        "surfsense.interrupt.raised",
        description="Count of SurfSense interrupts raised.",
    )
@lru_cache(maxsize=1)
 def _indexing_document_duration():
    return _get_meter().create_histogram(
        "surfsense.indexing.document.duration",
        unit="s",
        description="Duration of SurfSense document indexing.",
    )
@lru_cache(maxsize=1)
 def _indexing_document_outcome():
    return _get_meter().create_counter(
        "surfsense.indexing.document.outcome",
        description="Count of SurfSense document indexing outcomes.",
    )
@lru_cache(maxsize=1)
 def _connector_sync_duration():
    return _get_meter().create_histogram(
        "surfsense.connector.sync.duration",
        unit="s",
        description="Duration of SurfSense connector sync tasks.",
    )
@lru_cache(maxsize=1)
 def _connector_sync_outcome():
    return _get_meter().create_counter(
        "surfsense.connector.sync.outcome",
        description="Count of SurfSense connector sync outcomes.",
    )
@lru_cache(maxsize=1)
 def _auth_failures():
    return _get_meter().create_counter(
        "surfsense.auth.failures",
        description="Count of SurfSense authentication failures.",
    )
@lru_cache(maxsize=1)
 def _rate_limit_rejections():
    return _get_meter().create_counter(
        "surfsense.rate_limit.rejections",
        description="Count of SurfSense rate-limit rejections.",
    )
@lru_cache(maxsize=1)
 def _perf_elapsed():
    return _get_meter().create_histogram(
        "surfsense.perf.elapsed_ms",
        unit="ms",
        description="Elapsed time recorded by SurfSense perf timers.",
    )
@lru_cache(maxsize=1)
 def _chat_request_duration():
    return _get_meter().create_histogram(
        "surfsense.chat.request.duration",
        unit="ms",
        description="Duration of SurfSense streamed chat requests.",
    )
@lru_cache(maxsize=1)
 def _chat_request_outcome():
    return _get_meter().create_counter(
        "surfsense.chat.request.outcome",
        description="Count of SurfSense chat request outcomes.",
    )
@lru_cache(maxsize=1)
 def _subagent_invoke_duration():
    return _get_meter().create_histogram(
        "surfsense.subagent.invoke.duration",
        unit="ms",
        description="Duration of SurfSense subagent invocations.",
    )
@lru_cache(maxsize=1)
 def _subagent_invoke_outcome():
    return _get_meter().create_counter(
        "surfsense.subagent.invoke.outcome",
        description="Count of SurfSense subagent invocation outcomes.",
    )
@lru_cache(maxsize=1)
 def _etl_extract_duration():
    return _get_meter().create_histogram(
        "surfsense.etl.extract.duration",
        unit="s",
        description="Duration of SurfSense ETL extraction.",
    )
@lru_cache(maxsize=1)
 def _etl_extract_outcome():
    return _get_meter().create_counter(
        "surfsense.etl.extract.outcome",
        description="Count of SurfSense ETL extraction outcomes.",
    )
@lru_cache(maxsize=1)
 def _celery_heartbeat_refreshes():
    return _get_meter().create_counter(
        "surfsense.celery.heartbeat.refreshes",
        description="Count of SurfSense Celery heartbeat refreshes.",
    )
@lru_cache(maxsize=1)
 def _celery_heartbeat_failures():
    return _get_meter().create_counter(
        "surfsense.celery.heartbeat.failures",
        description="Count of SurfSense Celery heartbeat failures.",
    )
@lru_cache(maxsize=1)
 def _celery_queue_latency():
    return _get_meter().create_histogram(
        "surfsense.celery.queue.latency",
        unit="s",
        description="Time SurfSense Celery tasks spend waiting in queue.",
    )
 def record_model_call_duration(
    duration_ms: float, *, model: str | None, provider: str | None
 ) -> None:
    _record(
        _model_call_duration(),
        duration_ms,
        {
            "gen_ai.request.model": model,
            "gen_ai.provider.name": provider,
        },
    )
 def record_model_token_usage(
    *,
    input_tokens: int | None,
    output_tokens: int | None,
    model: str | None,
    provider: str | None,
 ) -> None:
    base = {
        "gen_ai.request.model": model,
        "gen_ai.provider.name": provider,
        "gen_ai.operation.name": "chat",
    }
    if input_tokens is not None:
        _record(
            _model_token_usage(),
            int(input_tokens),
            {**base, "gen_ai.token.type": "input"},
        )
    if output_tokens is not None:
        _record(
            _model_token_usage(),
            int(output_tokens),
            {**base, "gen_ai.token.type": "output"},
        )
 def record_tool_call_duration(duration_ms: float, *, tool_name: str) -> None:
    _record(_tool_call_duration(), duration_ms, {"tool.name": tool_name})
 def record_tool_call_error(*, tool_name: str) -> None:
    _add(_tool_call_errors(), 1, {"tool.name": tool_name})
 def record_kb_search_duration(
    duration_ms: float, *, search_space_id: int | None, surface: str
 ) -> None:
    _record(
        _kb_search_duration(),
        duration_ms,
        {"search_space.id": search_space_id, "search.surface": surface},
    )
 def record_compaction_run(*, reason: str | None) -> None:
    _add(_compaction_runs(), 1, {"compaction.reason": reason or "unknown"})
 def record_permission_ask(*, permission: str) -> None:
    _add(_permission_asks(), 1, {"permission.permission": permission})
 def record_interrupt(*, interrupt_type: str) -> None:
    _add(_interrupts(), 1, {"interrupt.type": interrupt_type})
 def record_indexing_document_duration(
    duration_s: float, *, document_type: str | None
 ) -> None:
    _record(
        _indexing_document_duration(),
        duration_s,
        {"document.type": document_type or "unknown"},
    )
 def record_indexing_document_outcome(*, document_type: str | None, status: str) -> None:
    _add(
        _indexing_document_outcome(),
        1,
        {"document.type": document_type or "unknown", "status": status},
    )
 def record_connector_sync_duration(
    duration_s: float, *, connector_type: str | None
 ) -> None:
    _record(
        _connector_sync_duration(),
        duration_s,
        {"connector.type": connector_type or "unknown"},
    )
 def record_connector_sync_outcome(
    *, connector_type: str | None, status: str, error_category: str | None = None
 ) -> None:
    _add(
        _connector_sync_outcome(),
        1,
        _attrs_with_optional_error_category(
            {"connector.type": connector_type or "unknown", "status": status},
            error_category,
        ),
    )
 def record_auth_failure(*, reason: str) -> None:
    _add(_auth_failures(), 1, {"reason": reason})
 def record_rate_limit_rejection(*, scope: str) -> None:
    _add(_rate_limit_rejections(), 1, {"scope": scope})
 def record_perf_elapsed(duration_ms: float, *, label: str) -> None:
    _record(_perf_elapsed(), duration_ms, {"label": label})
 def record_chat_request_duration(
    duration_ms: float,
    *,
    flow: str,
    outcome: str,
    agent_mode: str | None = None,
 ) -> None:
    _record(
        _chat_request_duration(),
        duration_ms,
        {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
    )
 def record_chat_request_outcome(
    *,
    flow: str,
    outcome: str,
    agent_mode: str | None = None,
    error_category: str | None = None,
 ) -> None:
    _add(
        _chat_request_outcome(),
        1,
        _attrs_with_optional_error_category(
            {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
            error_category,
        ),
    )
 def record_subagent_invoke_duration(
    duration_ms: float,
    *,
    subagent_type: str,
    path: str | None,
    outcome: str,
 ) -> None:
    _record(
        _subagent_invoke_duration(),
        duration_ms,
        {
            "subagent.type": subagent_type,
            "subagent.path": path or "unknown",
            "outcome": outcome,
        },
    )
 def record_subagent_invoke_outcome(
    *,
    subagent_type: str,
    path: str | None,
    outcome: str,
 ) -> None:
    _add(
        _subagent_invoke_outcome(),
        1,
        {
            "subagent.type": subagent_type,
            "subagent.path": path or "unknown",
            "outcome": outcome,
        },
    )
 def record_etl_extract_duration(
    duration_s: float,
    *,
    etl_service: str | None,
    content_type: str | None,
    status: str,
 ) -> None:
    _record(
        _etl_extract_duration(),
        duration_s,
        {
            "etl.service": etl_service or "unknown",
            "content.type": content_type or "unknown",
            "status": status,
        },
    )
 def record_etl_extract_outcome(
    *,
    etl_service: str | None,
    content_type: str | None,
    status: str,
    error_category: str | None = None,
 ) -> None:
    _add(
        _etl_extract_outcome(),
        1,
        _attrs_with_optional_error_category(
            {
                "etl.service": etl_service or "unknown",
                "content.type": content_type or "unknown",
                "status": status,
            },
            error_category,
        ),
    )
 def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None:
    _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type})
 def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None:
    _add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type})
 def record_celery_queue_latency(
    duration_s: float,
    *,
    task_name: str | None,
    queue: str | None,
    scheduled: bool,
    operation: str | None,
 ) -> None:
    _record(
        _celery_queue_latency(),
        duration_s,
        {
            "task.name": task_name or "unknown",
            "task.queue": queue or "unknown",
            "task.scheduled": bool(scheduled),
            "operation": operation or "unknown",
        },
    )
 def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]:
    from opentelemetry.metrics import Observation
    from app.utils.perf import system_snapshot
    snap = system_snapshot()
    value = snap.get(key)
    if not isinstance(value, int | float) or value < 0:
        return []
    if transform is not None:
        value = transform(value)
    return [Observation(value)]
 def _observe_gc_collections(_options: Any) -> list[Any]:
    from opentelemetry.metrics import Observation
    return [
        Observation(count, {"generation": str(generation)})
        for generation, count in enumerate(gc.get_count())
    ]
 def register_runtime_observables() -> None:
    """Register process/runtime observable gauges once per process."""
    global _OBSERVABLES_REGISTERED
    if _OBSERVABLES_REGISTERED or not _is_enabled():
        return
    meter = _get_meter()
    try:
        # Each callback returns the value for a single gauge except GC, whose
        # callback carries a generation attribute.
        meter.create_observable_gauge(
            "process.runtime.cpython.memory.rss",
            callbacks=[
                lambda _options: _runtime_snapshot_value(
                    "rss_mb", lambda v: float(v) * 1024 * 1024
                )
            ],
            unit="By",
            description="Resident set size of the SurfSense backend process.",
        )
        meter.create_observable_gauge(
            "process.runtime.cpython.cpu.utilization",
            callbacks=[
                lambda _options: _runtime_snapshot_value(
                    "cpu_percent", lambda v: float(v) / 100.0
                )
            ],
            unit="1",
            description="CPU utilization of the SurfSense backend process.",
        )
        meter.create_observable_gauge(
            "process.runtime.cpython.threads",
            callbacks=[lambda _options: _runtime_snapshot_value("threads")],
            unit="{thread}",
            description="Thread count of the SurfSense backend process.",
        )
        meter.create_observable_gauge(
            "process.runtime.cpython.open_fds",
            callbacks=[lambda _options: _runtime_snapshot_value("open_fds")],
            unit="{fd}",
            description="Open file descriptor count of the SurfSense backend process.",
        )
        meter.create_observable_gauge(
            "python.asyncio.tasks",
            callbacks=[lambda _options: _runtime_snapshot_value("asyncio_tasks")],
            unit="{task}",
            description="Live asyncio task count in the current event loop.",
        )
        meter.create_observable_gauge(
            "process.runtime.cpython.gc.collections",
            callbacks=[_observe_gc_collections],
            unit="{collection}",
            description="CPython GC counters by generation.",
        )
    except Exception:
        logger.warning("Failed to register OTel runtime observables", exc_info=True)
        return
    _OBSERVABLES_REGISTERED = True
 __all__ = [
    "categorize_exception",
    "parse_celery_task_label",
    "record_auth_failure",
    "record_celery_heartbeat_failure",
    "record_celery_heartbeat_refresh",
    "record_celery_queue_latency",
    "record_chat_request_duration",
    "record_chat_request_outcome",
    "record_compaction_run",
    "record_connector_sync_duration",
    "record_connector_sync_outcome",
    "record_etl_extract_duration",
    "record_etl_extract_outcome",
    "record_indexing_document_duration",
    "record_indexing_document_outcome",
    "record_interrupt",
    "record_kb_search_duration",
    "record_model_call_duration",
    "record_model_token_usage",
    "record_perf_elapsed",
    "record_permission_ask",
    "record_rate_limit_rejection",
    "record_subagent_invoke_duration",
    "record_subagent_invoke_outcome",
    "record_tool_call_duration",
    "record_tool_call_error",
    "register_runtime_observables",
 ]
--- a/surfsense_backend/app/observability/otel.py
+++ b/surfsense_backend/app/observability/otel.py
@ -66,6 +66,8 @@ def _resolve_enabled() -> bool:
    # Honor an explicit kill-switch first.
    if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}:
        return False
    if os.environ.get("OTEL_SDK_DISABLED", "").lower() in {"1", "true", "yes", "on"}:
        return False
    # Treat a configured endpoint as the canonical "OTel is wired up" signal.
    if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
        return True
@ -90,6 +92,48 @@ def is_enabled() -> bool:
    return _ENABLED
 def _clean_event_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
    """Coerce event attributes to OTel-safe scalar values."""
    cleaned: dict[str, str | int | float | bool] = {}
    for key, value in attrs.items():
        if value is None:
            continue
        if isinstance(value, bool | int | float):
            cleaned[key] = value
            continue
        text = str(value)
        if text:
            cleaned[key] = text
    return cleaned
 def add_event(name: str, attributes: dict[str, Any] | None = None) -> None:
    """Attach an event to the current active span.
    This is intentionally no-op and exception-safe when OTel is disabled,
    unavailable, or no span is currently recording.
    """
    if not _ENABLED or _ot_trace is None:
        return
    with contextlib.suppress(Exception):
        sp = _ot_trace.get_current_span()
        if sp is None or not sp.is_recording():
            return
        sp.add_event(
            name,
            attributes=_clean_event_attrs(attributes) if attributes else None,
        )
 def record_error(span_obj: Any, exc: BaseException) -> None:
    """Record an exception and mark a span as errored without re-raising."""
    if not _ENABLED:
        return
    with contextlib.suppress(Exception):
        span_obj.record_exception(exc)
        span_obj.set_status(_OtStatus(_OtStatusCode.ERROR, str(exc)))
 def _get_tracer():
    if not _OTEL_AVAILABLE:
        return None
@ -198,8 +242,11 @@ def model_call_span(
    attrs: dict[str, Any] = {}
    if model_id:
        attrs["model.id"] = model_id
        attrs["gen_ai.request.model"] = model_id
    if provider:
        attrs["model.provider"] = provider
        attrs["gen_ai.provider.name"] = provider
    attrs["gen_ai.operation.name"] = "chat"
    if extra:
        attrs.update(extra)
    return span("model.call", attributes=attrs)
@ -239,6 +286,152 @@ def kb_persist_span(
    return span("kb.persist", attributes=attrs)
 def chat_request_span(
    *,
    chat_id: int | None = None,
    search_space_id: int | None = None,
    flow: str | None = None,
    request_id: str | None = None,
    turn_id: str | None = None,
    filesystem_mode: str | None = None,
    client_platform: str | None = None,
    agent_mode: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Parent span for a single streamed chat or resume turn."""
    attrs: dict[str, Any] = {}
    if chat_id is not None:
        attrs["chat.id"] = int(chat_id)
    if search_space_id is not None:
        attrs["search_space.id"] = int(search_space_id)
    if flow:
        attrs["chat.flow"] = flow
    if request_id:
        attrs["request.id"] = request_id
    if turn_id:
        attrs["turn.id"] = turn_id
    if filesystem_mode:
        attrs["filesystem.mode"] = filesystem_mode
    if client_platform:
        attrs["client.platform"] = client_platform
    if agent_mode:
        attrs["agent.mode"] = agent_mode
    if extra:
        attrs.update(extra)
    return span("chat.request", attributes=attrs)
 def subagent_invoke_span(
    *,
    subagent_type: str,
    path: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around invoking a delegated subagent from the main agent."""
    attrs: dict[str, Any] = {"subagent.type": subagent_type}
    if path:
        attrs["subagent.path"] = path
    if extra:
        attrs.update(extra)
    return span("subagent.invoke", attributes=attrs)
 def connector_sync_span(
    *,
    connector_type: str | None,
    extra: dict[str, Any] | None = None,
 ):
    """Business-level span around connector indexing task execution."""
    attrs: dict[str, Any] = {"connector.type": connector_type or "unknown"}
    if extra:
        attrs.update(extra)
    return span("connector.sync", attributes=attrs)
 def etl_extract_span(
    *,
    content_type: str | None = None,
    file_extension: str | None = None,
    processing_mode: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around top-level ETL extraction for a file."""
    attrs: dict[str, Any] = {}
    if content_type:
        attrs["content.type"] = content_type
    if file_extension:
        attrs["file.extension"] = file_extension
    if processing_mode:
        attrs["processing.mode"] = processing_mode
    if extra:
        attrs.update(extra)
    return span("etl.extract", attributes=attrs)
 def etl_parse_span(
    *,
    etl_service: str | None,
    content_type: str | None = None,
    file_extension: str | None = None,
    processing_mode: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around a concrete ETL parser/backend call."""
    attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"}
    if content_type:
        attrs["content.type"] = content_type
    if file_extension:
        attrs["file.extension"] = file_extension
    if processing_mode:
        attrs["processing.mode"] = processing_mode
    if extra:
        attrs.update(extra)
    return span("etl.parse", attributes=attrs)
 def etl_ocr_span(
    *,
    etl_service: str | None,
    file_extension: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around OCR extraction from image content."""
    attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"}
    if file_extension:
        attrs["file.extension"] = file_extension
    if extra:
        attrs.update(extra)
    return span("etl.ocr", attributes=attrs)
 def etl_picture_describe_span(
    *,
    image_count: int | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around describing embedded images in a document."""
    attrs: dict[str, Any] = {}
    if image_count is not None:
        attrs["image.count"] = int(image_count)
    if extra:
        attrs.update(extra)
    return span("etl.picture.describe", attributes=attrs)
 def etl_picture_ocr_span(
    *,
    file_extension: str | None = None,
    extra: dict[str, Any] | None = None,
 ):
    """Span around per-image OCR during picture description."""
    attrs: dict[str, Any] = {}
    if file_extension:
        attrs["file.extension"] = file_extension
    if extra:
        attrs.update(extra)
    return span("etl.picture.ocr", attributes=attrs)
 def compaction_span(
    *,
    reason: str | None = None,
@ -301,14 +494,24 @@ def reload_for_tests() -> bool:
 __all__ = [
    "add_event",
    "chat_request_span",
    "compaction_span",
    "connector_sync_span",
    "etl_extract_span",
    "etl_ocr_span",
    "etl_parse_span",
    "etl_picture_describe_span",
    "etl_picture_ocr_span",
    "interrupt_span",
    "is_enabled",
    "kb_persist_span",
    "kb_search_span",
    "model_call_span",
    "permission_asked_span",
    "record_error",
    "reload_for_tests",
    "span",
    "subagent_invoke_span",
    "tool_call_span",
 ]
--- a/surfsense_backend/app/retriever/chunks_hybrid_search.py
+++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py
@ -1,13 +1,51 @@
 import asyncio
 import contextlib
 import functools
 import time
 from datetime import datetime
 from app.observability import metrics as ot_metrics, otel as ot
 from app.utils.perf import get_perf_logger
 _MAX_FETCH_CHUNKS_PER_DOC = 20
 def _instrument_search(mode: str):
    def _decorator(func):
        @functools.wraps(func)
        async def _wrapper(
            self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
        ):
            t0 = time.perf_counter()
            with ot.kb_search_span(
                search_space_id=search_space_id,
                query_chars=len(query_text),
                extra={"search.surface": "chunks", "search.mode": mode},
            ) as sp:
                try:
                    result = await func(
                        self, query_text, top_k, search_space_id, *args, **kwargs
                    )
                except Exception:
                    ot_metrics.record_kb_search_duration(
                        (time.perf_counter() - t0) * 1000,
                        search_space_id=search_space_id,
                        surface="chunks",
                    )
                    raise
                sp.set_attribute("result.count", len(result))
                ot_metrics.record_kb_search_duration(
                    (time.perf_counter() - t0) * 1000,
                    search_space_id=search_space_id,
                    surface="chunks",
                )
                return result
        return _wrapper
    return _decorator
 class ChucksHybridSearchRetriever:
    def __init__(self, db_session):
        """
@ -18,6 +56,7 @@ class ChucksHybridSearchRetriever:
        """
        self.db_session = db_session
    @_instrument_search("vector")
    async def vector_search(
        self,
        query_text: str,
@ -88,6 +127,7 @@ class ChucksHybridSearchRetriever:
        return chunks
    @_instrument_search("full_text")
    async def full_text_search(
        self,
        query_text: str,
@ -153,6 +193,7 @@ class ChucksHybridSearchRetriever:
        return chunks
    @_instrument_search("hybrid")
    async def hybrid_search(
        self,
        query_text: str,
--- a/surfsense_backend/app/retriever/documents_hybrid_search.py
+++ b/surfsense_backend/app/retriever/documents_hybrid_search.py
@ -1,12 +1,50 @@
 import contextlib
 import functools
 import time
 from datetime import datetime
 from app.observability import metrics as ot_metrics, otel as ot
 from app.utils.perf import get_perf_logger
 _MAX_FETCH_CHUNKS_PER_DOC = 20
 def _instrument_search(mode: str):
    def _decorator(func):
        @functools.wraps(func)
        async def _wrapper(
            self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
        ):
            t0 = time.perf_counter()
            with ot.kb_search_span(
                search_space_id=search_space_id,
                query_chars=len(query_text),
                extra={"search.surface": "documents", "search.mode": mode},
            ) as sp:
                try:
                    result = await func(
                        self, query_text, top_k, search_space_id, *args, **kwargs
                    )
                except Exception:
                    ot_metrics.record_kb_search_duration(
                        (time.perf_counter() - t0) * 1000,
                        search_space_id=search_space_id,
                        surface="documents",
                    )
                    raise
                sp.set_attribute("result.count", len(result))
                ot_metrics.record_kb_search_duration(
                    (time.perf_counter() - t0) * 1000,
                    search_space_id=search_space_id,
                    surface="documents",
                )
                return result
        return _wrapper
    return _decorator
 class DocumentHybridSearchRetriever:
    def __init__(self, db_session):
        """
@ -17,6 +55,7 @@ class DocumentHybridSearchRetriever:
        """
        self.db_session = db_session
    @_instrument_search("vector")
    async def vector_search(
        self,
        query_text: str,
@ -81,6 +120,7 @@ class DocumentHybridSearchRetriever:
        return documents
    @_instrument_search("full_text")
    async def full_text_search(
        self,
        query_text: str,
@ -145,6 +185,7 @@ class DocumentHybridSearchRetriever:
        return documents
    @_instrument_search("hybrid")
    async def hybrid_search(
        self,
        query_text: str,
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -43,6 +43,7 @@ from app.db import (
    async_session_maker,
    get_async_session,
 )
 from app.observability import metrics as ot_metrics, otel as ot
 from app.schemas import (
    GoogleDriveIndexRequest,
    MCPConnectorCreate,
@ -104,7 +105,9 @@ async def _run_indexing_heartbeat_loop(notification_id: int) -> None:
            await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
            try:
                get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
                ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="connector")
            except Exception as e:
                ot_metrics.record_celery_heartbeat_failure(heartbeat_type="connector")
                logger.warning(
                    f"Failed to refresh Redis heartbeat for notification "
                    f"{notification_id}: {e}"
@ -1243,6 +1246,12 @@ async def _persist_auth_expired(session: AsyncSession, connector_id: int) -> Non
    """Flag a connector as auth_expired so the frontend shows a re-auth prompt."""
    from sqlalchemy.orm.attributes import flag_modified
    ot.add_event(
        "connector.auth.expired",
        {
            "error.category": "auth_failed",
        },
    )
    try:
        result = await session.execute(
            select(SearchSourceConnector).where(
@ -1302,6 +1311,13 @@ async def _run_indexing_with_notifications(
    try:
        connector_lock_acquired = acquire_connector_indexing_lock(connector_id)
        if not connector_lock_acquired:
            ot.add_event(
                "connector.sync.skipped",
                {
                    "skip.reason": "lock_contention",
                    "error.category": "lock_contention",
                },
            )
            logger.info(
                f"Skipping indexing for connector {connector_id} "
                "(another worker already holds Redis connector lock)"
@ -1338,7 +1354,13 @@ async def _run_indexing_with_notifications(
                    get_heartbeat_redis_client().setex(
                        heartbeat_key, HEARTBEAT_TTL_SECONDS, "0"
                    )
                    ot_metrics.record_celery_heartbeat_refresh(
                        heartbeat_type="connector"
                    )
                except Exception as e:
                    ot_metrics.record_celery_heartbeat_failure(
                        heartbeat_type="connector"
                    )
                    logger.warning(f"Failed to set initial Redis heartbeat: {e}")
                # Start a background coroutine that refreshes the
@ -1366,6 +1388,15 @@ async def _run_indexing_with_notifications(
        ) -> None:
            """Callback to update notification during API retries (rate limits, etc.)"""
            nonlocal notification
            ot.add_event(
                "connector.retry.scheduled",
                {
                    "retry.reason": retry_reason,
                    "retry.attempt": attempt,
                    "retry.max": max_attempts,
                    "retry.delay_ms": int(wait_seconds * 1000),
                },
            )
            if notification:
                try:
                    await session.refresh(notification)
@ -1397,8 +1428,14 @@ async def _run_indexing_with_notifications(
                    get_heartbeat_redis_client().setex(
                        heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count)
                    )
                    ot_metrics.record_celery_heartbeat_refresh(
                        heartbeat_type="connector"
                    )
                except Exception as e:
                    # Don't let Redis errors break the indexing
                    ot_metrics.record_celery_heartbeat_failure(
                        heartbeat_type="connector"
                    )
                    logger.warning(f"Failed to set Redis heartbeat: {e}")
                try:
--- a/surfsense_backend/app/tasks/celery_tasks/init.py
+++ b/surfsense_backend/app/tasks/celery_tasks/init.py
@ -37,6 +37,10 @@ def get_celery_session_maker() -> async_sessionmaker:
            poolclass=NullPool,
            echo=False,
        )
        with contextlib.suppress(Exception):
            from app.observability.bootstrap import instrument_sqlalchemy_engine
            instrument_sqlalchemy_engine(_celery_engine)
        _celery_session_maker = async_sessionmaker(
            _celery_engine, expire_on_commit=False
        )
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@ -1,14 +1,52 @@
 """Celery tasks for connector indexing."""
 import logging
 import time
 import traceback
 from collections.abc import Awaitable, Callable
 from celery import current_task
 from app.celery_app import celery_app
-from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
+from app.observability import metrics as ot_metrics, otel as ot
 from app.tasks.celery_tasks import (
    get_celery_session_maker,
    run_async_celery_task as _run_async_celery_task,
 )
 logger = logging.getLogger(__name__)
 def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
    """Run connector sync work and record aggregate connector metrics."""
    task_name = getattr(current_task, "name", None) or "unknown"
    t0 = time.perf_counter()
    status = "failed"
    error_category: str | None = None
    try:
        with ot.connector_sync_span(connector_type=task_name) as sp:
            try:
                result = _run_async_celery_task(coro_factory)
                sp.set_attribute("connector.status", "success")
            except Exception as exc:
                error_category = ot_metrics.categorize_exception(exc)
                sp.set_attribute("connector.error.category", error_category)
                raise
        status = "success"
        return result
    finally:
        elapsed_s = time.perf_counter() - t0
        ot_metrics.record_connector_sync_duration(
            elapsed_s,
            connector_type=task_name,
        )
        ot_metrics.record_connector_sync_outcome(
            connector_type=task_name,
            status=status,
            error_category=error_category,
        )
 def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
    """
    Handle greenlet_spawn errors with detailed logging for debugging.
--- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
@ -9,6 +9,7 @@ from uuid import UUID
 from app.celery_app import celery_app
 from app.config import config
 from app.observability import metrics as ot_metrics
 from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
@ -59,7 +60,9 @@ def _start_heartbeat(notification_id: int) -> None:
    try:
        key = _get_heartbeat_key(notification_id)
        _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started")
        ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
    except Exception as e:
        ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
        logger.warning(
            f"Failed to set initial heartbeat for notification {notification_id}: {e}"
        )
@ -87,7 +90,9 @@ async def _run_heartbeat_loop(notification_id: int):
            await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
            try:
                _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
                ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
            except Exception as e:
                ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
                logger.warning(
                    f"Failed to refresh heartbeat for notification {notification_id}: {e}"
                )
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@ -14,6 +14,7 @@ import contextlib
 import gc
 import json
 import logging
 import sys
 import time
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass, field
@ -58,6 +59,7 @@ from app.db import (
    async_session_maker,
    shielded_async_session,
 )
 from app.observability import metrics as ot_metrics, otel as ot
 from app.prompts import TITLE_GENERATION_PROMPT
 from app.services.auto_model_pin_service import (
    mark_runtime_cooldown,
@ -883,6 +885,20 @@ async def stream_new_chat(
    stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
    stream_result.filesystem_mode = fs_mode
    stream_result.client_platform = fs_platform
    chat_agent_mode = "unknown"
    chat_outcome = "success"
    chat_error_category: str | None = None
    chat_span_cm = ot.chat_request_span(
        chat_id=chat_id,
        search_space_id=search_space_id,
        flow=flow,
        request_id=request_id,
        turn_id=stream_result.turn_id,
        filesystem_mode=fs_mode,
        client_platform=fs_platform,
        agent_mode=chat_agent_mode,
    )
    chat_span = chat_span_cm.__enter__()
    _log_file_contract("turn_start", stream_result)
    _perf_log.info(
        "[stream_new_chat] filesystem_mode=%s client_platform=%s",
@ -971,6 +987,14 @@ async def stream_new_chat(
                    requires_image_input=_requires_image_input,
                )
            ).resolved_llm_config_id
            ot.add_event(
                "model.pin.resolved",
                {
                    "pin.requested_id": requested_llm_config_id,
                    "pin.resolved_id": llm_config_id,
                    "pin.requires_image_input": _requires_image_input,
                },
            )
        except ValueError as pin_error:
            # Auto-pin's "no vision-capable cfg" path raises a ValueError
            # whose message we map to the friendly image-input SSE error
@ -987,6 +1011,13 @@ async def stream_new_chat(
                if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
                else "server_error"
            )
            if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT":
                ot.add_event(
                    "quota.denied",
                    {
                        "quota.code": error_code,
                    },
                )
            yield _emit_stream_error(
                message=str(pin_error),
                error_kind=error_kind,
@ -1041,6 +1072,12 @@ async def stream_new_chat(
                model_label = (
                    agent_config.config_name or agent_config.model_name or "model"
                )
                ot.add_event(
                    "quota.denied",
                    {
                        "quota.code": "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT",
                    },
                )
                yield _emit_stream_error(
                    message=(
                        f"The selected model ({model_label}) does not support "
@ -1084,6 +1121,12 @@ async def stream_new_chat(
                )
            _premium_reserved_micros = reserve_amount_micros
            if not quota_result.allowed:
                ot.add_event(
                    "quota.denied",
                    {
                        "quota.code": "PREMIUM_QUOTA_EXHAUSTED",
                    },
                )
                if requested_llm_config_id == 0:
                    try:
                        llm_config_id = (
@ -1097,6 +1140,13 @@ async def stream_new_chat(
                                requires_image_input=_requires_image_input,
                            )
                        ).resolved_llm_config_id
                        ot.add_event(
                            "model.repin",
                            {
                                "repin.reason": "premium_quota_exhausted",
                                "repin.to_config_id": llm_config_id,
                            },
                        )
                    except ValueError as pin_error:
                        yield _emit_stream_error(
                            message=str(pin_error),
@ -1189,6 +1239,9 @@ async def stream_new_chat(
        from app.config import config as _app_config
        use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED)
        chat_agent_mode = "multi" if use_multi_agent else "single"
        with contextlib.suppress(Exception):
            chat_span.set_attribute("agent.mode", chat_agent_mode)
        _t0 = time.perf_counter()
        agent_factory = (
@ -1863,6 +1916,14 @@ async def stream_new_chat(
                    llm_config_id,
                    time.perf_counter() - _t0,
                )
                ot.add_event(
                    "chat.rate_limit.recovered",
                    {
                        "recovery.reason": "provider_rate_limited",
                        "recovery.previous_config_id": previous_config_id,
                        "recovery.fallback_config_id": llm_config_id,
                    },
                )
                _log_chat_stream_error(
                    flow=flow,
                    error_kind="rate_limited",
@ -1893,6 +1954,12 @@ async def stream_new_chat(
        log_system_snapshot("stream_new_chat_END")
        if stream_result.is_interrupted:
            ot.add_event(
                "chat.interrupted",
                {
                    "chat.flow": flow,
                },
            )
            if title_task is not None and not title_task.done():
                title_task.cancel()
@ -2011,6 +2078,12 @@ async def stream_new_chat(
            user_message,
            error_extra,
        ) = _classify_stream_exception(e, flow_label="chat")
        chat_outcome = error_code or error_kind or "error"
        chat_error_category = ot_metrics.categorize_exception(e)
        with contextlib.suppress(Exception):
            chat_span.set_attribute("chat.outcome", chat_outcome)
            chat_span.set_attribute("error.category", chat_error_category)
            ot.record_error(chat_span, e)
        error_message = f"Error during chat: {e!s}"
        print(f"[stream_new_chat] {error_message}")
        print(f"[stream_new_chat] Exception type: {type(e).__name__}")
@ -2201,6 +2274,21 @@ async def stream_new_chat(
            )
        trim_native_heap()
        log_system_snapshot("stream_new_chat_END")
        with contextlib.suppress(Exception):
            chat_span.set_attribute("chat.outcome", chat_outcome)
            ot_metrics.record_chat_request_duration(
                (time.perf_counter() - _t_total) * 1000,
                flow=flow,
                outcome=chat_outcome,
                agent_mode=chat_agent_mode,
            )
            ot_metrics.record_chat_request_outcome(
                flow=flow,
                outcome=chat_outcome,
                agent_mode=chat_agent_mode,
                error_category=chat_error_category,
            )
        chat_span_cm.__exit__(*sys.exc_info())
 async def stream_resume_chat(
@ -2225,6 +2313,20 @@ async def stream_resume_chat(
    stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
    stream_result.filesystem_mode = fs_mode
    stream_result.client_platform = fs_platform
    chat_agent_mode = "unknown"
    chat_outcome = "success"
    chat_error_category: str | None = None
    chat_span_cm = ot.chat_request_span(
        chat_id=chat_id,
        search_space_id=search_space_id,
        flow="resume",
        request_id=request_id,
        turn_id=stream_result.turn_id,
        filesystem_mode=fs_mode,
        client_platform=fs_platform,
        agent_mode=chat_agent_mode,
    )
    chat_span = chat_span_cm.__enter__()
    _log_file_contract("turn_start", stream_result)
    _perf_log.info(
        "[stream_resume] filesystem_mode=%s client_platform=%s",
@ -2297,6 +2399,14 @@ async def stream_resume_chat(
                    selected_llm_config_id=llm_config_id,
                )
            ).resolved_llm_config_id
            ot.add_event(
                "model.pin.resolved",
                {
                    "pin.requested_id": requested_llm_config_id,
                    "pin.resolved_id": llm_config_id,
                    "pin.requires_image_input": False,
                },
            )
        except ValueError as pin_error:
            yield _emit_stream_error(
                message=str(pin_error),
@ -2353,6 +2463,12 @@ async def stream_resume_chat(
                )
            _resume_premium_reserved_micros = reserve_amount_micros
            if not quota_result.allowed:
                ot.add_event(
                    "quota.denied",
                    {
                        "quota.code": "PREMIUM_QUOTA_EXHAUSTED",
                    },
                )
                if requested_llm_config_id == 0:
                    try:
                        llm_config_id = (
@ -2365,6 +2481,13 @@ async def stream_resume_chat(
                                force_repin_free=True,
                            )
                        ).resolved_llm_config_id
                        ot.add_event(
                            "model.repin",
                            {
                                "repin.reason": "premium_quota_exhausted",
                                "repin.to_config_id": llm_config_id,
                            },
                        )
                    except ValueError as pin_error:
                        yield _emit_stream_error(
                            message=str(pin_error),
@ -2454,6 +2577,11 @@ async def stream_resume_chat(
        visibility = thread_visibility or ChatVisibility.PRIVATE
        from app.config import config as _app_config
        chat_agent_mode = (
            "multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single"
        )
        with contextlib.suppress(Exception):
            chat_span.set_attribute("agent.mode", chat_agent_mode)
        _t0 = time.perf_counter()
        agent_factory = (
            create_multi_agent_chat_deep_agent
@ -2695,6 +2823,14 @@ async def stream_resume_chat(
                    llm_config_id,
                    time.perf_counter() - _t0,
                )
                ot.add_event(
                    "chat.rate_limit.recovered",
                    {
                        "recovery.reason": "provider_rate_limited",
                        "recovery.previous_config_id": previous_config_id,
                        "recovery.fallback_config_id": llm_config_id,
                    },
                )
                _log_chat_stream_error(
                    flow="resume",
                    error_kind="rate_limited",
@ -2722,6 +2858,12 @@ async def stream_resume_chat(
            chat_id,
        )
        if stream_result.is_interrupted:
            ot.add_event(
                "chat.interrupted",
                {
                    "chat.flow": "resume",
                },
            )
            usage_summary = accumulator.per_message_summary()
            _perf_log.info(
                "[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
@ -2815,6 +2957,12 @@ async def stream_resume_chat(
            user_message,
            error_extra,
        ) = _classify_stream_exception(e, flow_label="resume")
        chat_outcome = error_code or error_kind or "error"
        chat_error_category = ot_metrics.categorize_exception(e)
        with contextlib.suppress(Exception):
            chat_span.set_attribute("chat.outcome", chat_outcome)
            chat_span.set_attribute("error.category", chat_error_category)
            ot.record_error(chat_span, e)
        error_message = f"Error during resume: {e!s}"
        print(f"[stream_resume_chat] {error_message}")
        print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}")
@ -2964,3 +3112,18 @@ async def stream_resume_chat(
            )
        trim_native_heap()
        log_system_snapshot("stream_resume_chat_END")
        with contextlib.suppress(Exception):
            chat_span.set_attribute("chat.outcome", chat_outcome)
            ot_metrics.record_chat_request_duration(
                (time.perf_counter() - _t_total) * 1000,
                flow="resume",
                outcome=chat_outcome,
                agent_mode=chat_agent_mode,
            )
            ot_metrics.record_chat_request_outcome(
                flow="resume",
                outcome=chat_outcome,
                agent_mode=chat_agent_mode,
                error_category=chat_error_category,
            )
        chat_span_cm.__exit__(*sys.exc_info())
--- a/surfsense_backend/app/utils/perf.py
+++ b/surfsense_backend/app/utils/perf.py
@ -16,6 +16,8 @@ import time
 from contextlib import asynccontextmanager, contextmanager
 from typing import Any
 from app.observability import metrics as ot_metrics
 _perf_log: logging.Logger | None = None
 _last_rss_mb: float = 0.0
@ -50,6 +52,7 @@ def perf_timer(label: str, *, extra: dict[str, Any] | None = None):
    if extra:
        suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
    log.info("%s in %.3fs%s", label, elapsed, suffix)
    ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
@asynccontextmanager
@ -68,6 +71,7 @@ async def perf_async_timer(label: str, *, extra: dict[str, Any] | None = None):
    if extra:
        suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
    log.info("%s in %.3fs%s", label, elapsed, suffix)
    ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
 def system_snapshot() -> dict[str, Any]:
--- a/surfsense_backend/main.py
+++ b/surfsense_backend/main.py
@ -12,9 +12,26 @@ if sys.platform == "win32":
 from app.config.uvicorn import load_uvicorn_config
 _old_log_record_factory = logging.getLogRecordFactory()
 def _otel_safe_log_record_factory(*args, **kwargs):
    record = _old_log_record_factory(*args, **kwargs)
    if not hasattr(record, "otelTraceID"):
        record.otelTraceID = "0"
    if not hasattr(record, "otelSpanID"):
        record.otelSpanID = "0"
    return record
 logging.setLogRecordFactory(_otel_safe_log_record_factory)
 logging.basicConfig(
    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    format=(
        "%(asctime)s - %(name)s - %(levelname)s - "
        "[trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] %(message)s"
    ),
    datefmt="%Y-%m-%d %H:%M:%S",
 )
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -76,6 +76,17 @@ dependencies = [
    "litellm>=1.83.7",
    "langchain-litellm>=0.6.4",
    "deepagents>=0.4.12,<0.5",
    "opentelemetry-api>=1.40.0",
    "opentelemetry-sdk>=1.40.0",
    "opentelemetry-exporter-otlp>=1.40.0",
    "opentelemetry-semantic-conventions>=0.61b0",
    "opentelemetry-instrumentation-fastapi>=0.61b0",
    "opentelemetry-instrumentation-sqlalchemy>=0.61b0",
    "opentelemetry-instrumentation-psycopg>=0.61b0",
    "opentelemetry-instrumentation-redis>=0.61b0",
    "opentelemetry-instrumentation-httpx>=0.61b0",
    "opentelemetry-instrumentation-celery>=0.61b0",
    "opentelemetry-instrumentation-logging>=0.61b0",
 ]
 [dependency-groups]
--- a/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py
+++ b/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py
@ -23,6 +23,7 @@ pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True)
 def _disable_otel(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
    monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
    from app.observability import otel as ot
@ -99,16 +100,17 @@ class TestAnnotateModelResponse:
                "total_tokens": 150,
            },
        )
-        _annotate_model_response(sp, msg)
+        assert _annotate_model_response(sp, msg) == (100, 50)
-        sp.set_attribute.assert_any_call("tokens.prompt", 100)
+        sp.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 100)
-        sp.set_attribute.assert_any_call("tokens.completion", 50)
+        sp.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 50)
-        sp.set_attribute.assert_any_call("tokens.total", 150)
+        sp.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 150)
        sp.set_attribute.assert_any_call("gen_ai.operation.name", "chat")
    def test_handles_response_with_no_metadata(self) -> None:
        sp = MagicMock()
        msg = AIMessage(content="hello")
        # Should not raise even when usage_metadata is missing
-        _annotate_model_response(sp, msg)
+        assert _annotate_model_response(sp, msg) == (None, None)
 class TestAnnotateToolResult:
@ -119,7 +121,7 @@ class TestAnnotateToolResult:
            tool_call_id="abc",
            status="success",
        )
-        _annotate_tool_result(sp, result)
+        assert _annotate_tool_result(sp, result) is False
        sp.set_attribute.assert_any_call("tool.output.size", len("result text"))
        sp.set_attribute.assert_any_call("tool.status", "success")
@ -130,7 +132,7 @@ class TestAnnotateToolResult:
            tool_call_id="abc",
            additional_kwargs={"error": {"code": "x"}},
        )
-        _annotate_tool_result(sp, result)
+        assert _annotate_tool_result(sp, result) is True
        sp.set_attribute.assert_any_call("tool.error", True)
@ -193,3 +195,91 @@ class TestMiddlewareIntegration:
            assert result.content == "enabled"
        finally:
            ot.reload_for_tests()
    async def test_enabled_model_call_records_metrics(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        from app.observability import otel as ot
        duration_calls: list[dict[str, Any]] = []
        token_calls: list[dict[str, Any]] = []
        monkeypatch.setattr(
            "app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_call_duration",
            lambda duration_ms, **attrs: duration_calls.append(
                {"duration_ms": duration_ms, **attrs}
            ),
        )
        monkeypatch.setattr(
            "app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_token_usage",
            lambda **attrs: token_calls.append(attrs),
        )
        ot.reload_for_tests()
        try:
            mw = OtelSpanMiddleware()
            async def handler(req):
                return AIMessage(
                    content="enabled",
                    usage_metadata={
                        "input_tokens": 3,
                        "output_tokens": 5,
                        "total_tokens": 8,
                    },
                )
            request = MagicMock()
            request.model = MagicMock()
            request.model.model_name = "gpt-4o"
            request.model.provider = "openai"
            await mw.awrap_model_call(request, handler)
            assert duration_calls
            assert token_calls == [
                {
                    "input_tokens": 3,
                    "output_tokens": 5,
                    "model": "gpt-4o",
                    "provider": "openai",
                }
            ]
        finally:
            ot.reload_for_tests()
    async def test_enabled_tool_call_records_error_metric(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        from app.observability import otel as ot
        errors: list[str] = []
        monkeypatch.setattr(
            "app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_error",
            lambda *, tool_name: errors.append(tool_name),
        )
        monkeypatch.setattr(
            "app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_duration",
            lambda *args, **kwargs: None,
        )
        ot.reload_for_tests()
        try:
            mw = OtelSpanMiddleware()
            async def handler(req):
                return ToolMessage(
                    content="failed",
                    tool_call_id="abc",
                    status="error",
                )
            request = MagicMock()
            request.tool = MagicMock()
            request.tool.name = "web_search"
            await mw.awrap_tool_call(request, handler)
            assert errors == ["web_search"]
        finally:
            ot.reload_for_tests()
--- a/surfsense_backend/tests/unit/observability/test_helpers.py
+++ b/surfsense_backend/tests/unit/observability/test_helpers.py
@ -0,0 +1,101 @@
 """Tests for pure observability helper functions."""
 from __future__ import annotations
 import pytest
 from app.observability import metrics as ot_metrics, otel as ot
 pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True)
 def _disable_otel(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
    ot.reload_for_tests()
    yield
    ot.reload_for_tests()
@pytest.mark.parametrize(
    ("task_name", "expected"),
    [
        ("reindex_document", "reindex"),
        ("delete_document_background", "delete"),
        ("delete_folder_documents_background", "delete"),
        ("delete_search_space_background", "delete"),
        ("process_extension_document", "process"),
        ("process_youtube_video", "process"),
        ("process_file_upload", "process"),
        ("process_file_upload_with_document", "process"),
        ("process_circleback_meeting", "process"),
        ("generate_video_presentation", "generate"),
        ("generate_content_podcast", "generate"),
        ("cleanup_stale_indexing_notifications", "cleanup"),
        ("reconcile_pending_stripe_page_purchases", "reconcile"),
        ("reconcile_pending_stripe_token_purchases", "reconcile"),
        ("check_periodic_schedules", "check"),
        ("ai_sort_search_space", "ai"),
        ("index_notion_pages", "index"),
        ("index_github_repos", "index"),
        ("index_google_drive_files", "index"),
        ("index_composio_connector", "index"),
        ("index_obsidian_attachment", "index"),
        ("index_local_folder", "index"),
        ("index_uploaded_folder_files", "index"),
        ("noseparator", "noseparator"),
        ("", "unknown"),
    ],
 )
 def test_parse_celery_task_label(task_name: str, expected: str) -> None:
    assert ot_metrics.parse_celery_task_label(task_name) == expected
 def test_parse_celery_task_label_handles_none() -> None:
    assert ot_metrics.parse_celery_task_label(None) == "unknown"
@pytest.mark.parametrize(
    ("exc", "expected"),
    [
        (type("RateLimitError", (Exception,), {})(), "rate_limited"),
        (type("AuthenticationError", (Exception,), {})(), "auth_failed"),
        (type("QuotaInsufficientError", (Exception,), {})(), "quota_exhausted"),
        (TimeoutError(), "timeout"),
        (type("APIConnectionError", (Exception,), {})(), "network_failed"),
        (type("ServiceUnavailableError", (Exception,), {})(), "server_error"),
        (type("LockContentionError", (Exception,), {})(), "lock_contention"),
        (type("UnsupportedFormatError", (Exception,), {})(), "unsupported_format"),
        (type("ProviderError", (Exception,), {})(), "provider_error"),
        (RuntimeError("plain"), "unknown"),
    ],
 )
 def test_categorize_exception(exc: BaseException, expected: str) -> None:
    assert ot_metrics.categorize_exception(exc) == expected
 def test_record_celery_queue_latency_noops_when_disabled() -> None:
    ot_metrics.record_celery_queue_latency(
        0.5,
        task_name="index_notion_pages",
        queue="surfsense.connectors",
        scheduled=False,
        operation="index",
    )
 def test_add_event_noops_when_disabled() -> None:
    ot.add_event("test.event", {"value": 1})
 def test_add_event_noops_without_current_span(monkeypatch: pytest.MonkeyPatch) -> None:
    class FakeTrace:
        @staticmethod
        def get_current_span():
            return None
    monkeypatch.setattr(ot, "_ENABLED", True)
    monkeypatch.setattr(ot, "_ot_trace", FakeTrace())
    ot.add_event("test.event", {"value": 1})
--- a/surfsense_backend/tests/unit/observability/test_otel.py
+++ b/surfsense_backend/tests/unit/observability/test_otel.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import pytest
-from app.observability import otel
+from app.observability import bootstrap, metrics, otel
 pytestmark = pytest.mark.unit
@ -12,7 +12,14 @@ pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True)
 def _reset_otel_state(monkeypatch: pytest.MonkeyPatch):
    """Force a clean OTel disabled state per test, then restore after."""
-    for env in ("OTEL_EXPORTER_OTLP_ENDPOINT", "SURFSENSE_DISABLE_OTEL"):
+    for env in (
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "OTEL_EXPORTER_OTLP_PROTOCOL",
        "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
        "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
        "SURFSENSE_DISABLE_OTEL",
        "OTEL_SDK_DISABLED",
    ):
        monkeypatch.delenv(env, raising=False)
    monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
    otel.reload_for_tests()
@ -36,6 +43,195 @@ def test_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None
    assert otel.reload_for_tests() is False
 def test_spec_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
    monkeypatch.setenv("OTEL_SDK_DISABLED", "true")
    assert otel.reload_for_tests() is False
 class TestBootstrapConfig:
    def test_disabled_checks_both_kill_switches(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
        assert bootstrap.is_otel_disabled() is False
        monkeypatch.setenv("OTEL_SDK_DISABLED", "on")
        assert bootstrap.is_otel_disabled() is True
    def test_configured_by_shared_or_signal_endpoint(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        assert bootstrap.is_otel_configured() is False
        monkeypatch.setenv(
            "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "http://localhost:4317"
        )
        assert bootstrap.is_otel_configured() is True
    def test_init_otel_noops_when_disabled(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        called = {"traces": False}
        def fake_init_traces(app=None):
            del app
            called["traces"] = True
        monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        monkeypatch.setattr(bootstrap, "init_traces", fake_init_traces)
        bootstrap.init_otel()
        assert called["traces"] is False
    def test_init_otel_dispatches_enabled_signals(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        called: list[str] = []
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        monkeypatch.setattr(
            bootstrap, "init_traces", lambda app=None: called.append("traces")
        )
        monkeypatch.setattr(bootstrap, "init_metrics", lambda: called.append("metrics"))
        monkeypatch.setattr(bootstrap, "init_logs", lambda: called.append("logs"))
        bootstrap.init_otel()
        assert called == ["traces", "metrics", "logs"]
    def test_resource_defaults_include_service_metadata(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.setenv("OTEL_SERVICE_NAME", "custom-backend")
        monkeypatch.setenv("SURFSENSE_ENV", "test")
        resource = bootstrap._build_resource()
        attrs = dict(resource.attributes)
        assert attrs["service.name"] == "custom-backend"
        assert attrs["deployment.environment.name"] == "test"
        assert attrs["deployment.environment"] == "test"
        assert attrs["service.instance.id"]
    def test_deployment_environment_uses_surfsense_env_only(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.delenv("SURFSENSE_ENV", raising=False)
        assert bootstrap._deployment_environment() == "dev"
        monkeypatch.setenv("SURFSENSE_ENV", "production")
        assert bootstrap._deployment_environment() == "production"
    def test_shutdown_is_safe_without_providers(self) -> None:
        bootstrap.shutdown_otel()
    def test_init_logs_enables_log_correlation(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        calls: list[dict[str, object]] = []
        class FakeLoggingInstrumentor:
            def instrument(self, **kwargs: object) -> None:
                calls.append(kwargs)
        def fake_safe_instrument(name: str, callback):
            assert name == "logging"
            monkeypatch.setattr(
                "opentelemetry.instrumentation.logging.LoggingInstrumentor",
                FakeLoggingInstrumentor,
            )
            callback()
            return True
        monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False)
        monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument)
        bootstrap.init_logs()
        assert calls == [{"set_logging_format": True}]
 class TestMetricHelpers:
    def test_all_metric_helpers_noop_safely_when_disabled(self) -> None:
        metrics.record_model_call_duration(12.5, model="gpt-4o", provider="openai")
        metrics.record_model_token_usage(
            input_tokens=10,
            output_tokens=5,
            model="gpt-4o",
            provider="openai",
        )
        metrics.record_tool_call_duration(3.0, tool_name="web_search")
        metrics.record_tool_call_error(tool_name="web_search")
        metrics.record_kb_search_duration(
            4.0,
            search_space_id=1,
            surface="documents",
        )
        metrics.record_compaction_run(reason="auto")
        metrics.record_permission_ask(permission="write_file")
        metrics.record_interrupt(interrupt_type="permission_ask")
        metrics.record_indexing_document_duration(1.2, document_type="FILE")
        metrics.record_indexing_document_outcome(document_type="FILE", status="success")
        metrics.record_connector_sync_duration(
            2.3,
            connector_type="index_notion_pages",
        )
        metrics.record_connector_sync_outcome(
            connector_type="index_notion_pages",
            status="success",
        )
        metrics.record_auth_failure(reason="UNAUTHORIZED")
        metrics.record_rate_limit_rejection(scope="login")
        metrics.record_perf_elapsed(7.0, label="[test]")
    def test_runtime_observables_register_once(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        class FakeMeter:
            def __init__(self) -> None:
                self.names: list[str] = []
            def create_observable_gauge(self, name: str, **kwargs) -> None:
                del kwargs
                self.names.append(name)
        fake_meter = FakeMeter()
        monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
        monkeypatch.setattr(metrics, "_is_enabled", lambda: True)
        monkeypatch.setattr(metrics, "_get_meter", lambda: fake_meter)
        metrics.register_runtime_observables()
        metrics.register_runtime_observables()
        assert len(fake_meter.names) == 6
        assert fake_meter.names.count("python.asyncio.tasks") == 1
        monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
 def test_log_record_factory_provides_zero_otel_fields() -> None:
    import logging
    import main  # noqa: F401
    record = logging.getLogRecordFactory()(
        "test",
        logging.INFO,
        __file__,
        1,
        "hello",
        (),
        None,
    )
    assert record.otelTraceID == "0"
    assert record.otelSpanID == "0"
 class TestNoopSpansWhenDisabled:
    def test_generic_span_yields_noop(self) -> None:
        with otel.span("any.thing", attributes={"x": 1}) as sp:
--- a/surfsense_backend/tests/unit/observability/test_retriever_otel.py
+++ b/surfsense_backend/tests/unit/observability/test_retriever_otel.py
@ -0,0 +1,61 @@
 """Tests for retriever OTel wrappers."""
 from __future__ import annotations
 from contextlib import contextmanager
 from typing import Any
 import pytest
 from app.retriever.documents_hybrid_search import _instrument_search
 pytestmark = pytest.mark.unit
 class _Span:
    def __init__(self) -> None:
        self.attrs: dict[str, Any] = {}
    def set_attribute(self, key: str, value: Any) -> None:
        self.attrs[key] = value
@contextmanager
 def _fake_span(**kwargs):
    span = _Span()
    span.attrs.update(kwargs)
    yield span
@pytest.mark.asyncio
 async def test_retriever_wrapper_records_one_span_and_metric(monkeypatch) -> None:
    calls: list[dict[str, Any]] = []
    monkeypatch.setattr(
        "app.retriever.documents_hybrid_search.ot.kb_search_span",
        lambda **kwargs: _fake_span(**kwargs),
    )
    monkeypatch.setattr(
        "app.retriever.documents_hybrid_search.ot_metrics.record_kb_search_duration",
        lambda duration_ms, **attrs: calls.append(
            {"duration_ms": duration_ms, **attrs}
        ),
    )
    class Retriever:
        @_instrument_search("hybrid")
        async def search(
            self,
            query_text: str,
            top_k: int,
            search_space_id: int,
        ) -> list[str]:
            del query_text, top_k, search_space_id
            return ["doc-1", "doc-2"]
    result = await Retriever().search("hello", 3, 42)
    assert result == ["doc-1", "doc-2"]
    assert len(calls) == 1
    assert calls[0]["search_space_id"] == 42
    assert calls[0]["surface"] == "documents"
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
--- a/surfsense_web/app/(home)/login/GoogleLoginButton.tsx
+++ b/surfsense_web/app/(home)/login/GoogleLoginButton.tsx
@ -3,9 +3,9 @@ import { useTranslations } from "next-intl";
 import { useState } from "react";
 import { Logo } from "@/components/Logo";
 import { Button } from "@/components/ui/button";
 import { BACKEND_URL } from "@/lib/env-config";
 import { trackLoginAttempt } from "@/lib/posthog/events";
 import { AmbientBackground } from "./AmbientBackground";
 import { BACKEND_URL } from "@/lib/env-config";
 function GoogleGLogo({ className }: { className?: string }) {
 	return (
--- a/surfsense_web/components/assistant-ui/chat-viewport.tsx
+++ b/surfsense_web/components/assistant-ui/chat-viewport.tsx
@ -3,16 +3,19 @@
 import { ThreadPrimitive } from "@assistant-ui/react";
 import { ArrowDownIcon } from "lucide-react";
 import type { FC, ReactNode } from "react";
-import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
+import { Button } from "@/components/ui/button";
 const ChatScrollToBottom: FC = () => (
 	<ThreadPrimitive.ScrollToBottom asChild>
-		<TooltipIconButton
+		<Button
-			tooltip="Scroll to bottom"
+			type="button"
-			className="aui-thread-scroll-to-bottom -top-12 absolute z-10 self-center rounded-full border-0 bg-muted p-4 text-foreground hover:bg-accent hover:text-accent-foreground disabled:invisible"
+			variant="ghost"
 			size="icon"
 			aria-label="Scroll to bottom"
 			className="aui-thread-scroll-to-bottom -top-12 absolute z-10 size-10 self-center rounded-full border border-input bg-muted p-0 text-foreground shadow-sm shadow-black/5 hover:bg-accent hover:text-accent-foreground disabled:invisible dark:shadow-black/10"
 		>
 			<ArrowDownIcon />
-		</TooltipIconButton>
+		</Button>
 	</ThreadPrimitive.ScrollToBottom>
 );
--- a/surfsense_web/components/assistant-ui/connector-popup.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup.tsx
@ -1,7 +1,7 @@
 "use client";
 import { useAtomValue } from "jotai";
-import { AlertTriangle, Settings } from "lucide-react";
+import { AlertTriangle } from "lucide-react";
 import { useRouter } from "next/navigation";
 import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react";
 import { createPortal } from "react-dom";
@ -381,7 +381,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 										{/* LLM Configuration Warning */}
 										{!llmConfigLoading && !hasDocumentSummaryLLM && (
 											<div className="mb-6">
-												<Alert variant="destructive">
+												<Alert variant="warning">
 													<AlertTriangle />
 													<AlertTitle>LLM Configuration Required</AlertTitle>
 													<AlertDescription>
@ -392,7 +392,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 														</p>
 														<Button
 															size="sm"
-															variant="outline"
+															variant="secondary"
 															onClick={() => {
 																handleOpenChange(false);
 																router.push(
@ -400,7 +400,6 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
 																);
 															}}
 														>
 															<Settings className="mr-2 h-4 w-4" />
 															Go to Settings
 														</Button>
 													</AlertDescription>
--- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx
+++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx
@ -1,7 +1,7 @@
 "use client";
 import { useAtomValue } from "jotai";
-import { AlertTriangle, Settings } from "lucide-react";
+import { AlertTriangle } from "lucide-react";
 import { useRouter } from "next/navigation";
 import {
 	createContext,
@ -148,7 +148,7 @@ const DocumentUploadPopupContent: FC<{
 					<div className="px-4 sm:px-6 pb-4 sm:pb-6">
 						{!isLoading && !hasDocumentSummaryLLM ? (
 							<div className="mb-4">
-								<Alert variant="destructive">
+								<Alert variant="warning">
 									<AlertTriangle />
 									<AlertTitle>LLM Configuration Required</AlertTitle>
 									<AlertDescription>
@ -159,13 +159,12 @@ const DocumentUploadPopupContent: FC<{
 										</p>
 										<Button
 											size="sm"
-											variant="outline"
+											variant="secondary"
 											onClick={() => {
 												onOpenChange(false);
 												router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`);
 											}}
 										>
 											<Settings className="mr-2 h-4 w-4" />
 											Go to Settings
 										</Button>
 									</AlertDescription>
--- a/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx
+++ b/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx
@ -2,14 +2,9 @@
 import { Settings, Trash2, Users } from "lucide-react";
 import { useTranslations } from "next-intl";
 import type { MouseEvent } from "react";
 import { useCallback, useRef, useState } from "react";
 import { Button } from "@/components/ui/button";
 import {
 	ContextMenu,
 	ContextMenuContent,
 	ContextMenuItem,
 	ContextMenuTrigger,
 } from "@/components/ui/context-menu";
 import {
 	DropdownMenu,
 	DropdownMenuContent,
@ -80,19 +75,28 @@ export function SearchSpaceAvatar({
 	const initials = getInitials(name);
 	const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm";
-	// Long-press state for mobile
+	const [menuOpen, setMenuOpen] = useState(false);
 	const [longPressMenuOpen, setLongPressMenuOpen] = useState(false);
 	const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
 	const touchMoved = useRef(false);
 	const openMenu = useCallback(() => {
 		setMenuOpen(true);
 	}, []);
 	const handleContextMenu = useCallback((event: MouseEvent<HTMLButtonElement>) => {
 		event.preventDefault();
 		event.stopPropagation();
 		setMenuOpen(true);
 	}, []);
 	const handleTouchStart = useCallback(() => {
 		touchMoved.current = false;
 		longPressTimer.current = setTimeout(() => {
 			if (!touchMoved.current) {
-				setLongPressMenuOpen(true);
+				openMenu();
 			}
 		}, 500);
-	}, []);
+	}, [openMenu]);
 	const handleTouchMove = useCallback(() => {
 		touchMoved.current = true;
@ -120,12 +124,26 @@ export function SearchSpaceAvatar({
 		</div>
 	);
-	const avatarButton = (
+	const avatarButton = (withMenuHandlers = false) => (
 		<Button
 			type="button"
 			variant="ghost"
 			size="icon"
 			onClick={onClick}
 			onPointerDown={
 				withMenuHandlers
 					? (event) => {
 							if (event.button === 0) {
 								event.preventDefault();
 							}
 						}
 					: undefined
 			}
 			onContextMenu={withMenuHandlers ? handleContextMenu : undefined}
 			onTouchStart={withMenuHandlers ? handleTouchStart : undefined}
 			onTouchMove={withMenuHandlers ? handleTouchMove : undefined}
 			onTouchEnd={withMenuHandlers ? handleTouchEnd : undefined}
 			onTouchCancel={withMenuHandlers ? handleTouchEnd : undefined}
 			className={cn(
 				"relative rounded-lg font-semibold text-white transition-all select-none",
 				"hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
@ -173,73 +191,41 @@ export function SearchSpaceAvatar({
 		</>
 	);
-	// If delete or settings handlers are provided, wrap with context menu
+	// If delete or settings handlers are provided, expose them through a dropdown menu.
 	if (onDelete || onSettings) {
-		// Mobile: use long-press triggered DropdownMenu
+		const trigger = (
 		if (disableTooltip) {
 			return (
 				<DropdownMenu open={longPressMenuOpen} onOpenChange={setLongPressMenuOpen}>
 			<DropdownMenuTrigger asChild>
-						<div
+				{avatarButton(true)}
 							className="inline-block"
 							onTouchStart={handleTouchStart}
 							onTouchMove={handleTouchMove}
 							onTouchEnd={handleTouchEnd}
 							onTouchCancel={handleTouchEnd}
 						>
 							{avatarButton}
 						</div>
 			</DropdownMenuTrigger>
 					<DropdownMenuContent>{menuItems}</DropdownMenuContent>
 				</DropdownMenu>
 		);
 		}
 		// Desktop: use right-click ContextMenu + Tooltip
 		return (
-			<ContextMenu>
+			<DropdownMenu open={menuOpen} onOpenChange={setMenuOpen}>
 				{disableTooltip ? (
 					trigger
 				) : (
 					<Tooltip>
-					<TooltipTrigger asChild>
+						<TooltipTrigger asChild>{trigger}</TooltipTrigger>
 						<ContextMenuTrigger asChild>
 							<div className="inline-block">{avatarButton}</div>
 						</ContextMenuTrigger>
 					</TooltipTrigger>
 						<TooltipContent side="right" sideOffset={8}>
 							{tooltipContent}
 						</TooltipContent>
 					</Tooltip>
 				<ContextMenuContent>
 					{onSettings && (
 						<ContextMenuItem onClick={onSettings}>
 							<Settings className="mr-2 h-4 w-4" />
 							{tCommon("settings")}
 						</ContextMenuItem>
 				)}
-					{onDelete && isOwner && (
+				<DropdownMenuContent side="right" align="start">
-						<ContextMenuItem onClick={onDelete}>
+					{menuItems}
-							<Trash2 className="mr-2 h-4 w-4" />
+				</DropdownMenuContent>
-							{tCommon("delete")}
+			</DropdownMenu>
 						</ContextMenuItem>
 					)}
 					{onDelete && !isOwner && (
 						<ContextMenuItem onClick={onDelete}>
 							<Trash2 className="mr-2 h-4 w-4" />
 							{t("leave")}
 						</ContextMenuItem>
 					)}
 				</ContextMenuContent>
 			</ContextMenu>
 		);
 	}
 	// No context menu needed
 	if (disableTooltip) {
-		return avatarButton;
+		return avatarButton();
 	}
 	return (
 		<Tooltip>
-			<TooltipTrigger asChild>{avatarButton}</TooltipTrigger>
+			<TooltipTrigger asChild>{avatarButton()}</TooltipTrigger>
 			<TooltipContent side="right" sideOffset={8}>
 				{tooltipContent}
 			</TooltipContent>
--- a/surfsense_web/components/ui/button.tsx
+++ b/surfsense_web/components/ui/button.tsx
@ -14,7 +14,7 @@ const buttonVariants = cva(
 					"bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40",
 				outline:
 					"border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground",
-				secondary: "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80",
+				secondary: "bg-accent text-accent-foreground shadow-xs hover:bg-accent/80",
 				ghost:
 					"hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0",
 				link: "text-primary underline-offset-4 hover:underline",
--- a/surfsense_web/content/docs/meta.json
+++ b/surfsense_web/content/docs/meta.json
@ -13,6 +13,7 @@
 		"how-to",
 		"---Developers---",
 		"testing",
 		"observability",
 		"code-of-conduct"
 	]
 }
--- a/surfsense_web/content/docs/observability.mdx
+++ b/surfsense_web/content/docs/observability.mdx
@ -0,0 +1,167 @@
 ---
 title: Observability
 description: Configure backend traces and metrics for SurfSense
 icon: Radar
 ---
 SurfSense uses OpenTelemetry for backend traces and metrics. Application logs
 include trace and span IDs so you can correlate logs with traces, but logs stay
 on the normal container stderr path.
 ## Enable Locally
 The development compose file reads backend settings from
 `surfsense_backend/.env`. Add these values there:
 ```dotenv
 SURFSENSE_ENABLE_OTEL=true
 SURFSENSE_ENV=dev
 OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-lgtm:4317
 OTEL_EXPORTER_OTLP_PROTOCOL=grpc
 OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
 OTEL_METRIC_EXPORT_INTERVAL=300000
 ```
 Then start the development stack with the local LGTM backend:
 ```bash
 docker compose -f docker/docker-compose.dev.yml up --build
 ```
 Grafana is exposed on `http://localhost:3001` by default.
 ## Enable in Production Docker Compose
 Production Docker Compose reads backend and collector settings from
 `docker/.env`. The API and Celery worker export telemetry to the bundled
 collector at `otel-collector:4317`; the collector is the only service that uses
 the Grafana Cloud credentials.
 Add these values to `docker/.env`:
 ```dotenv
 SURFSENSE_ENV=production
 SURFSENSE_ENABLE_OTEL=true
 OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
 OTEL_EXPORTER_OTLP_PROTOCOL=grpc
 OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
 OTEL_METRIC_EXPORT_INTERVAL=300000
 GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
 GRAFANA_CLOUD_INSTANCE_ID=<stack instance id>
 GRAFANA_CLOUD_API_KEY=<cloud access policy token>
 ```
 Then start the stack:
 ```bash
 docker compose -f docker/docker-compose.yml --profile observability up -d
 ```
 The collector receives OTLP on `otel-collector:4317`, scrubs sensitive span
 attributes, applies the configured tail-sampling policy, batches exports,
 retries failures, and forwards traces and metrics to Grafana Cloud over OTLP
 HTTP.
 When deploying `surfsense_backend/Dockerfile` directly instead of production
 compose, use the same split: SurfSense containers export to a collector, and
 the collector owns the Grafana Cloud credentials.
 ## Automatic Traces
 When OpenTelemetry is enabled, the backend instruments:
 - FastAPI inbound requests.
 - SQLAlchemy queries from the main async engine and Celery task engine.
 - Raw psycopg calls used by the LangGraph checkpointer.
 - Redis commands.
 - HTTPX outbound requests.
 - Celery producer and worker execution.
 ## Manual Spans
 SurfSense keeps project-specific spans behind `app.observability.otel`:
 - `model.call`
 - `tool.call`
 - `chat.request`
 - `kb.search`
 - `kb.persist`
 - `connector.sync`
 - `subagent.invoke`
 - `etl.extract`
 - `etl.parse`
 - `etl.ocr`
 - `etl.picture.describe`
 - `etl.picture.ocr`
 - `compaction.run`
 - `permission.asked`
 - `interrupt.raised`
 Keep span names and attributes low-cardinality. Do not attach user content,
 prompts, document titles, file paths, user-specific URLs, secrets, or raw
 queries as span attributes.
 ## Metrics
 The OpenTelemetry instrumentors provide HTTP, HTTPX, and Celery runtime
 metrics. SurfSense adds these project metrics from `app.observability.metrics`:
 - `surfsense.model.call.duration`
 - `gen_ai.client.token.usage`
 - `surfsense.tool.call.duration`
 - `surfsense.tool.call.errors`
 - `surfsense.chat.request.duration`
 - `surfsense.chat.request.outcome`
 - `surfsense.kb.search.duration`
 - `surfsense.compaction.runs`
 - `surfsense.permission.asks`
 - `surfsense.interrupt.raised`
 - `surfsense.indexing.document.duration`
 - `surfsense.indexing.document.outcome`
 - `surfsense.connector.sync.duration`
 - `surfsense.connector.sync.outcome`
 - `surfsense.subagent.invoke.duration`
 - `surfsense.subagent.invoke.outcome`
 - `surfsense.etl.extract.duration`
 - `surfsense.etl.extract.outcome`
 - `surfsense.celery.heartbeat.refreshes`
 - `surfsense.celery.heartbeat.failures`
 - `surfsense.celery.queue.latency`
 - `surfsense.auth.failures`
 - `surfsense.rate_limit.rejections`
 - `surfsense.perf.elapsed_ms`
 Runtime gauges include process RSS, CPU utilization, threads, open file
 descriptors, asyncio tasks, and CPython GC counters.
 ## Logs
 `LoggingInstrumentor().instrument()` injects `otelTraceID` and `otelSpanID` into
 standard Python `LogRecord`s. The root log format writes them as
 `trace_id=... span_id=...`.
 SurfSense intentionally does not create an OpenTelemetry `LoggerProvider`,
 `LoggingHandler`, or `OTLPLogExporter`. Container stderr remains the log
 transport.
 ## Verification
 1. Hit a FastAPI endpoint and confirm an inbound server span appears in Grafana.
 2. Run a chat request and confirm `model.call` and `tool.call` child spans.
 3. Run a knowledge-base search and confirm `kb.search` spans and SQL child spans.
 4. Run connector indexing and confirm Celery producer/worker spans share a trace
   ID and connector sync metrics increment.
 5. Confirm `gen_ai.client.token.usage`, model/tool durations, request duration,
   Celery runtime, and runtime gauges appear within one export interval.
 6. Confirm logs emitted inside a traced request show non-zero trace and span IDs.
 ## Out Of Scope
 - Frontend/browser OpenTelemetry.
 - OpenTelemetry log export.
 - Profiling.
 - Production backend selection.
 - Tail-sampling collector configuration.
 - Replacing LangSmith.
 - Vendor SDKs.
--- a/surfsense_web/lib/source.ts
+++ b/surfsense_web/lib/source.ts
@ -7,6 +7,7 @@ import {
 	Download,
 	FlaskConical,
 	Heart,
 	Radar,
 	Unplug,
 	Wrench,
 } from "lucide-react";
@ -26,6 +27,7 @@ const DOCS_ICONS: Record<string, React.ComponentType> = {
 	Download,
 	FlaskConical,
 	Heart,
 	Radar,
 	Unplug,
 	Wrench,
 };