Merge pull request #1427 from AnishSarkar22/feat/opentelemetry

feat: OpenTelemetry integration
This commit is contained in:
Rohan Verma 2026-05-23 15:50:20 -07:00 committed by GitHub
commit da4ba09d88
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
45 changed files with 7554 additions and 4442 deletions

View file

@ -7,6 +7,9 @@
# SurfSense version (use "latest" or a specific version like "0.0.14") # SurfSense version (use "latest" or a specific version like "0.0.14")
SURFSENSE_VERSION=latest SURFSENSE_VERSION=latest
# Deployment environment: dev or production
SURFSENSE_ENV=production
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Core Settings # Core Settings
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
@ -304,6 +307,28 @@ STT_SERVICE=local/base
# LANGSMITH_API_KEY= # LANGSMITH_API_KEY=
# LANGSMITH_PROJECT=surfsense # LANGSMITH_PROJECT=surfsense
# OpenTelemetry traces and metrics.
# Enable the collector with: docker compose --profile observability up -d
# SURFSENSE_ENABLE_OTEL=true
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
#
# Emergency kill switch.
# OTEL_SDK_DISABLED=true
#
# Grafana Cloud OTLP credentials. These are used only by the collector container.
# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
# GRAFANA_CLOUD_INSTANCE_ID=
# GRAFANA_CLOUD_API_KEY=
#
# Optional host port overrides for the bundled OTel Collector. Only change
# these if the host already uses 4317/4318/13133; backend containers still use
# the internal Docker endpoint above.
# OTEL_GRPC_PORT=4317
# OTEL_HTTP_PORT=4318
# OTEL_HEALTH_PORT=13133
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Advanced (optional) # Advanced (optional)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------

View file

@ -78,6 +78,15 @@ services:
timeout: 5s timeout: 5s
retries: 5 retries: 5
otel-lgtm:
image: grafana/otel-lgtm:latest
ports:
- "${OTEL_GRPC_PORT:-4317}:4317"
- "${OTEL_HTTP_PORT:-4318}:4318"
- "${OTEL_GRAFANA_PORT:-3001}:3000"
- "${OTEL_TEMPO_PORT:-3200}:3200"
restart: unless-stopped
searxng: searxng:
image: searxng/searxng:2026.3.13-3c1f68c59 image: searxng/searxng:2026.3.13-3c1f68c59
ports: ports:

View file

@ -61,6 +61,29 @@ services:
timeout: 5s timeout: 5s
retries: 5 retries: 5
otel-collector:
image: otel/opentelemetry-collector-contrib:0.152.1
profiles:
- observability
command: ["--config=/etc/otelcol/config.yaml"]
volumes:
- ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
environment:
GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
ports:
- "${OTEL_GRPC_PORT:-4317}:4317"
- "${OTEL_HTTP_PORT:-4318}:4318"
- "${OTEL_HEALTH_PORT:-13133}:13133"
mem_limit: 2g
restart: unless-stopped
healthcheck:
test: ["CMD", "/otelcol-contrib", "--version"]
interval: 30s
timeout: 5s
retries: 3
searxng: searxng:
image: searxng/searxng:2026.3.13-3c1f68c59 image: searxng/searxng:2026.3.13-3c1f68c59
volumes: volumes:

View file

@ -0,0 +1,81 @@
extensions:
health_check:
endpoint: 0.0.0.0:13133
basicauth/grafana_cloud:
client_auth:
username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
password: ${env:GRAFANA_CLOUD_API_KEY}
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
# Percentage limits are calculated against the collector container memory limit.
# Keep docker-compose.yml/Coolify memory limit set for predictability.
memory_limiter:
check_interval: 1s
limit_percentage: 80
spike_limit_percentage: 25
attributes/scrub:
actions:
- key: http.request.header.authorization
action: delete
- key: http.request.header.cookie
action: delete
- key: db.statement
action: delete
tail_sampling:
decision_wait: 10s
num_traces: 50000
expected_new_traces_per_sec: 100
policies:
- name: errors
type: status_code
status_code:
status_codes: [ERROR]
- name: slow-requests
type: latency
latency:
threshold_ms: 500
- name: baseline
type: probabilistic
probabilistic:
sampling_percentage: 100
batch:
timeout: 5s
send_batch_size: 1024
send_batch_max_size: 2048
exporters:
otlp_http/grafana_cloud:
endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
auth:
authenticator: basicauth/grafana_cloud
sending_queue:
enabled: true
queue_size: 10000
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
service:
extensions: [health_check, basicauth/grafana_cloud]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, attributes/scrub, tail_sampling, batch]
exporters: [otlp_http/grafana_cloud]
metrics:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp_http/grafana_cloud]

View file

@ -1,5 +1,8 @@
DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
# Deployment environment: dev or production
SURFSENSE_ENV=dev
#Celery Config #Celery Config
CELERY_BROKER_URL=redis://localhost:6379/0 CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0 CELERY_RESULT_BACKEND=redis://localhost:6379/0
@ -303,8 +306,16 @@ LANGSMITH_PROJECT=surfsense
# SURFSENSE_ENABLE_BUSY_MUTEX=false # SURFSENSE_ENABLE_BUSY_MUTEX=false
# SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call # SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call
# Observability — OTel (also requires OTEL_EXPORTER_OTLP_ENDPOINT) # Observability - OTel
# SURFSENSE_ENABLE_OTEL=false # SURFSENSE_ENABLE_OTEL=false
# OpenTelemetry - endpoint enables export; absent = no-op.
# Production should point at an OTel Collector. For local docker-compose.dev.yml,
# use http://otel-lgtm:4317 instead.
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf
# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
# OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes
# OTEL_SDK_DISABLED=true # emergency kill-switch
# Skills + subagents # Skills + subagents
# SURFSENSE_ENABLE_SKILLS=false # SURFSENSE_ENABLE_SKILLS=false

View file

@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool
from langgraph.errors import GraphInterrupt from langgraph.errors import GraphInterrupt
from langgraph.types import Command, Interrupt from langgraph.types import Command, Interrupt
from app.observability import metrics as ot_metrics, otel as ot
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
from .config import ( from .config import (
@ -173,6 +174,9 @@ def build_task_tool_with_parent_config(
exc_info=True, exc_info=True,
) )
invoke_path = "resume" if pending_value is not None else "fresh"
invoke_start = time.perf_counter()
invoke_outcome = "ok"
if pending_value is not None: if pending_value is not None:
resume_value = consume_surfsense_resume(runtime) resume_value = consume_surfsense_resume(runtime)
if resume_value is None: if resume_value is None:
@ -188,18 +192,94 @@ def build_task_tool_with_parent_config(
# Prevent the parent's resume payload from leaking into subagent # Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback. # interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime) drain_parent_null_resume(runtime)
with ot.subagent_invoke_span(
subagent_type=subagent_type, path=invoke_path
) as sp:
try: try:
result = subagent.invoke( result = subagent.invoke(
build_resume_command(resume_value, pending_id), build_resume_command(resume_value, pending_id),
config=sub_config, config=sub_config,
) )
sp.set_attribute("subagent.outcome", invoke_outcome)
except GraphInterrupt as gi: except GraphInterrupt as gi:
invoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
invoke_outcome = "error"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
raise
else: else:
with ot.subagent_invoke_span(
subagent_type=subagent_type, path=invoke_path
) as sp:
try: try:
result = subagent.invoke(subagent_state, config=sub_config) result = subagent.invoke(subagent_state, config=sub_config)
sp.set_attribute("subagent.outcome", invoke_outcome)
except GraphInterrupt as gi: except GraphInterrupt as gi:
invoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
invoke_outcome = "error"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
raise
invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000
ot_metrics.record_subagent_invoke_duration(
invoke_elapsed_ms,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
return _return_command_with_state_update(result, runtime.tool_call_id) return _return_command_with_state_update(result, runtime.tool_call_id)
async def atask( async def atask(
@ -274,13 +354,29 @@ def build_task_tool_with_parent_config(
# Prevent the parent's resume payload from leaking into subagent # Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback. # interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime) drain_parent_null_resume(runtime)
with ot.subagent_invoke_span(
subagent_type=subagent_type, path=invoke_path
) as sp:
try: try:
result = await subagent.ainvoke( result = await subagent.ainvoke(
build_resume_command(resume_value, pending_id), build_resume_command(resume_value, pending_id),
config=sub_config, config=sub_config,
) )
sp.set_attribute("subagent.outcome", ainvoke_outcome)
except GraphInterrupt as gi: except GraphInterrupt as gi:
ainvoke_outcome = "interrupted" ainvoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
_perf_log.info( _perf_log.info(
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs", "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
@ -292,11 +388,44 @@ def build_task_tool_with_parent_config(
time.perf_counter() - atask_start, time.perf_counter() - atask_start,
) )
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
ainvoke_outcome = "error"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
raise
else: else:
with ot.subagent_invoke_span(
subagent_type=subagent_type, path=invoke_path
) as sp:
try: try:
result = await subagent.ainvoke(subagent_state, config=sub_config) result = await subagent.ainvoke(
subagent_state, config=sub_config
)
sp.set_attribute("subagent.outcome", ainvoke_outcome)
except GraphInterrupt as gi: except GraphInterrupt as gi:
ainvoke_outcome = "interrupted" ainvoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
_perf_log.info( _perf_log.info(
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs", "aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
@ -308,6 +437,21 @@ def build_task_tool_with_parent_config(
time.perf_counter() - atask_start, time.perf_counter() - atask_start,
) )
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
ainvoke_outcome = "error"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
raise
ainvoke_elapsed = time.perf_counter() - ainvoke_start ainvoke_elapsed = time.perf_counter() - ainvoke_start
except GraphInterrupt: except GraphInterrupt:
raise raise
@ -326,6 +470,17 @@ def build_task_tool_with_parent_config(
merge_elapsed, merge_elapsed,
time.perf_counter() - atask_start, time.perf_counter() - atask_start,
) )
ot_metrics.record_subagent_invoke_duration(
ainvoke_elapsed * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
return cmd return cmd
return StructuredTool.from_function( return StructuredTool.from_function(

View file

@ -17,6 +17,7 @@ from langchain_core.tools import BaseTool
from langgraph.types import interrupt from langgraph.types import interrupt
from app.agents.new_chat.permissions import Rule from app.agents.new_chat.permissions import Rule
from app.observability import metrics as ot_metrics
from app.observability import otel as ot from app.observability import otel as ot
from .decision import normalize_permission_decision from .decision import normalize_permission_decision
@ -52,6 +53,8 @@ def request_permission_decision(
), ),
ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE), ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE),
): ):
ot_metrics.record_permission_ask(permission=tool_name)
ot_metrics.record_interrupt(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE)
decision = interrupt(payload) decision = interrupt(payload)
return normalize_permission_decision(decision) return normalize_permission_decision(decision)

View file

@ -34,6 +34,7 @@ from deepagents.middleware.summarization import (
) )
from langchain_core.messages import SystemMessage from langchain_core.messages import SystemMessage
from app.observability import metrics as ot_metrics
from app.observability import otel as ot from app.observability import otel as ot
if TYPE_CHECKING: if TYPE_CHECKING:
@ -178,6 +179,7 @@ class SurfSenseCompactionMiddleware(SummarizationMiddleware):
messages_in=len(conversation_messages), messages_in=len(conversation_messages),
extra={"compaction.cutoff_index": int(cutoff_index)}, extra={"compaction.cutoff_index": int(cutoff_index)},
): ):
ot_metrics.record_compaction_run(reason="auto")
messages_to_summarize, preserved_messages = super()._partition_messages( messages_to_summarize, preserved_messages = super()._partition_messages(
conversation_messages, cutoff_index conversation_messages, cutoff_index
) )

View file

@ -47,6 +47,7 @@ from langgraph.config import get_config
from langgraph.runtime import Runtime from langgraph.runtime import Runtime
from langgraph.types import interrupt from langgraph.types import interrupt
from app.observability import metrics as ot_metrics
from app.observability import otel as ot from app.observability import otel as ot
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -195,6 +196,7 @@ class DoomLoopMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Respon
"interrupt.tool": (action or {}).get("tool", "<unknown>"), "interrupt.tool": (action or {}).get("tool", "<unknown>"),
}, },
): ):
ot_metrics.record_interrupt(interrupt_type="permission_ask")
decision = interrupt( decision = interrupt(
{ {
"type": "permission_ask", "type": "permission_ask",

View file

@ -16,13 +16,14 @@ dashboards expect.
from __future__ import annotations from __future__ import annotations
import logging import logging
import time
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
from langchain.agents.middleware import AgentMiddleware from langchain.agents.middleware import AgentMiddleware
from langchain_core.messages import AIMessage, ToolMessage from langchain_core.messages import AIMessage, ToolMessage
from app.observability import otel as ot from app.observability import metrics as ot_metrics, otel as ot
if TYPE_CHECKING: # pragma: no cover — type-only if TYPE_CHECKING: # pragma: no cover — type-only
from langchain.agents.middleware.types import ( from langchain.agents.middleware.types import (
@ -62,14 +63,37 @@ class OtelSpanMiddleware(AgentMiddleware):
return await handler(request) return await handler(request)
model_id, provider = _resolve_model_attrs(request) model_id, provider = _resolve_model_attrs(request)
t0 = time.perf_counter()
with ot.model_call_span(model_id=model_id, provider=provider) as sp: with ot.model_call_span(model_id=model_id, provider=provider) as sp:
_annotate_model_request(sp, model_id=model_id, provider=provider)
try: try:
result = await handler(request) result = await handler(request)
except Exception: except Exception:
ot_metrics.record_model_call_duration(
(time.perf_counter() - t0) * 1000,
model=model_id,
provider=provider,
)
# span context manager records + re-raises # span context manager records + re-raises
raise raise
else: else:
_annotate_model_response(sp, result) input_tokens, output_tokens = _annotate_model_response(
sp,
result,
model_id=model_id,
provider=provider,
)
ot_metrics.record_model_call_duration(
(time.perf_counter() - t0) * 1000,
model=model_id,
provider=provider,
)
ot_metrics.record_model_token_usage(
input_tokens=input_tokens,
output_tokens=output_tokens,
model=model_id,
provider=provider,
)
return result return result
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@ -87,9 +111,24 @@ class OtelSpanMiddleware(AgentMiddleware):
tool_name = _resolve_tool_name(request) tool_name = _resolve_tool_name(request)
input_size = _resolve_input_size(request) input_size = _resolve_input_size(request)
t0 = time.perf_counter()
with ot.tool_call_span(tool_name, input_size=input_size) as sp: with ot.tool_call_span(tool_name, input_size=input_size) as sp:
try:
result = await handler(request) result = await handler(request)
_annotate_tool_result(sp, result) except Exception:
ot_metrics.record_tool_call_duration(
(time.perf_counter() - t0) * 1000,
tool_name=tool_name,
)
ot_metrics.record_tool_call_error(tool_name=tool_name)
raise
errored = _annotate_tool_result(sp, result)
ot_metrics.record_tool_call_duration(
(time.perf_counter() - t0) * 1000,
tool_name=tool_name,
)
if errored:
ot_metrics.record_tool_call_error(tool_name=tool_name)
return result return result
@ -154,8 +193,29 @@ def _resolve_input_size(request: Any) -> int | None:
return None return None
def _annotate_model_response(span: Any, result: Any) -> None: def _annotate_model_request(
span: Any, *, model_id: str | None, provider: str | None
) -> None:
try:
span.set_attribute("gen_ai.operation.name", "chat")
if model_id:
span.set_attribute("gen_ai.request.model", model_id)
if provider:
span.set_attribute("gen_ai.provider.name", provider)
except Exception: # pragma: no cover — defensive
pass
def _annotate_model_response(
span: Any,
result: Any,
*,
model_id: str | None = None,
provider: str | None = None,
) -> tuple[int | None, int | None]:
"""Best-effort: attach prompt/completion token counts when available.""" """Best-effort: attach prompt/completion token counts when available."""
input_tokens: int | None = None
output_tokens: int | None = None
try: try:
# ModelResponse may be a dataclass with .result containing AIMessage # ModelResponse may be a dataclass with .result containing AIMessage
msg: Any msg: Any
@ -165,22 +225,42 @@ def _annotate_model_response(span: Any, result: Any) -> None:
inner = getattr(result, "result", None) inner = getattr(result, "result", None)
msg = inner[-1] if isinstance(inner, list) and inner else inner msg = inner[-1] if isinstance(inner, list) and inner else inner
if msg is None: if msg is None:
return return None, None
if provider:
span.set_attribute("gen_ai.provider.name", provider)
if model_id:
span.set_attribute("gen_ai.request.model", model_id)
response_model = getattr(msg, "response_metadata", {}) or {}
if isinstance(response_model, dict):
response_model = (
response_model.get("model_name")
or response_model.get("model")
or response_model.get("model_id")
)
if not response_model:
response_model = model_id
if response_model:
span.set_attribute("gen_ai.response.model", str(response_model))
span.set_attribute("gen_ai.operation.name", "chat")
usage = getattr(msg, "usage_metadata", None) or {} usage = getattr(msg, "usage_metadata", None) or {}
if isinstance(usage, dict): if isinstance(usage, dict):
if (n := usage.get("input_tokens")) is not None: if (n := usage.get("input_tokens")) is not None:
span.set_attribute("tokens.prompt", int(n)) input_tokens = int(n)
span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
if (n := usage.get("output_tokens")) is not None: if (n := usage.get("output_tokens")) is not None:
span.set_attribute("tokens.completion", int(n)) output_tokens = int(n)
span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
if (n := usage.get("total_tokens")) is not None: if (n := usage.get("total_tokens")) is not None:
span.set_attribute("tokens.total", int(n)) span.set_attribute("gen_ai.usage.total_tokens", int(n))
tool_calls = getattr(msg, "tool_calls", None) or [] tool_calls = getattr(msg, "tool_calls", None) or []
span.set_attribute("model.tool_calls", len(tool_calls)) span.set_attribute("model.tool_calls", len(tool_calls))
except Exception: # pragma: no cover — defensive except Exception: # pragma: no cover — defensive
pass pass
return input_tokens, output_tokens
def _annotate_tool_result(span: Any, result: Any) -> None: def _annotate_tool_result(span: Any, result: Any) -> bool:
errored = False
try: try:
if isinstance(result, ToolMessage): if isinstance(result, ToolMessage):
content = ( content = (
@ -192,11 +272,14 @@ def _annotate_tool_result(span: Any, result: Any) -> None:
status = getattr(result, "status", None) status = getattr(result, "status", None)
if isinstance(status, str): if isinstance(status, str):
span.set_attribute("tool.status", status) span.set_attribute("tool.status", status)
errored = status.lower() == "error"
kwargs = getattr(result, "additional_kwargs", None) or {} kwargs = getattr(result, "additional_kwargs", None) or {}
if isinstance(kwargs, dict) and kwargs.get("error"): if isinstance(kwargs, dict) and kwargs.get("error"):
span.set_attribute("tool.error", True) span.set_attribute("tool.error", True)
errored = True
except Exception: # pragma: no cover — defensive except Exception: # pragma: no cover — defensive
pass pass
return errored
__all__ = ["OtelSpanMiddleware"] __all__ = ["OtelSpanMiddleware"]

View file

@ -61,6 +61,7 @@ from app.agents.new_chat.permissions import (
aggregate_action, aggregate_action,
evaluate_many, evaluate_many,
) )
from app.observability import metrics as ot_metrics
from app.observability import otel as ot from app.observability import otel as ot
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -284,6 +285,8 @@ class PermissionMiddleware(AgentMiddleware): # type: ignore[type-arg]
), ),
ot.interrupt_span(interrupt_type="permission_ask"), ot.interrupt_span(interrupt_type="permission_ask"),
): ):
ot_metrics.record_permission_ask(permission=tool_name)
ot_metrics.record_interrupt(interrupt_type="permission_ask")
decision = interrupt(payload) decision = interrupt(payload)
return _normalize_permission_decision(decision) return _normalize_permission_decision(decision)

View file

@ -45,6 +45,8 @@ from langchain.agents.middleware.types import (
from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
from langchain_core.messages import AIMessage from langchain_core.messages import AIMessage
from app.observability import metrics as ot_metrics, otel as ot
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Names of exception classes for which a retry would not help — context # Names of exception classes for which a retry would not help — context
@ -198,6 +200,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
if not self._should_retry(exc) or attempt >= self.max_retries: if not self._should_retry(exc) or attempt >= self.max_retries:
raise raise
delay = self._delay_for_attempt(attempt, exc) delay = self._delay_for_attempt(attempt, exc)
ot.add_event(
"model.retry.scheduled",
{
"retry.attempt": attempt + 1,
"retry.max": self.max_retries,
"retry.delay_ms": int(delay * 1000),
"retry.reason": ot_metrics.categorize_exception(exc),
},
)
try: try:
dispatch_custom_event( dispatch_custom_event(
"surfsense.retrying", "surfsense.retrying",
@ -231,6 +242,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
if not self._should_retry(exc) or attempt >= self.max_retries: if not self._should_retry(exc) or attempt >= self.max_retries:
raise raise
delay = self._delay_for_attempt(attempt, exc) delay = self._delay_for_attempt(attempt, exc)
ot.add_event(
"model.retry.scheduled",
{
"retry.attempt": attempt + 1,
"retry.max": self.max_retries,
"retry.delay_ms": int(delay * 1000),
"retry.reason": ot_metrics.categorize_exception(exc),
},
)
try: try:
await adispatch_custom_event( await adispatch_custom_event(
"surfsense.retrying", "surfsense.retrying",

View file

@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any
from langchain.agents.middleware import ModelFallbackMiddleware from langchain.agents.middleware import ModelFallbackMiddleware
from app.observability import metrics as ot_metrics, otel as ot
if TYPE_CHECKING: if TYPE_CHECKING:
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
@ -55,7 +57,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
raise raise
last_exception = e last_exception = e
for fallback_model in self.models: for attempt, fallback_model in enumerate(self.models, start=1):
ot.add_event(
"model.fallback",
{
"fallback.attempt": attempt,
"fallback.from": attempt - 1,
"fallback.to": attempt,
"fallback.reason": ot_metrics.categorize_exception(last_exception),
},
)
try: try:
return handler(request.override(model=fallback_model)) return handler(request.override(model=fallback_model))
except Exception as e: except Exception as e:
@ -79,7 +90,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
raise raise
last_exception = e last_exception = e
for fallback_model in self.models: for attempt, fallback_model in enumerate(self.models, start=1):
ot.add_event(
"model.fallback",
{
"fallback.attempt": attempt,
"fallback.from": attempt - 1,
"fallback.to": attempt,
"fallback.reason": ot_metrics.categorize_exception(last_exception),
},
)
try: try:
return await handler(request.override(model=fallback_model)) return await handler(request.override(model=fallback_model))
except Exception as e: except Exception as e:

View file

@ -1,4 +1,5 @@
import asyncio import asyncio
import contextlib
import gc import gc
import logging import logging
import time import time
@ -36,13 +37,15 @@ from app.config import (
) )
from app.db import User, create_db_and_tables, get_async_session from app.db import User, create_db_and_tables, get_async_session
from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
from app.observability import metrics as ot_metrics
from app.observability.bootstrap import init_otel, shutdown_otel
from app.rate_limiter import get_real_client_ip, limiter from app.rate_limiter import get_real_client_ip, limiter
from app.routes import router as crud_router from app.routes import router as crud_router
from app.routes.auth_routes import router as auth_router from app.routes.auth_routes import router as auth_router
from app.schemas import UserCreate, UserRead, UserUpdate from app.schemas import UserCreate, UserRead, UserUpdate
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
from app.users import SECRET, auth_backend, current_active_user, fastapi_users from app.users import SECRET, auth_backend, current_active_user, fastapi_users
from app.utils.perf import get_perf_logger, log_system_snapshot from app.utils.perf import log_system_snapshot
_error_logger = logging.getLogger("surfsense.errors") _error_logger = logging.getLogger("surfsense.errors")
@ -127,6 +130,8 @@ def _http_exception_handler(request: Request, exc: HTTPException) -> JSONRespons
logged server-side. logged server-side.
""" """
rid = _get_request_id(request) rid = _get_request_id(request)
if exc.status_code in {401, 403} and request.url.path.startswith("/auth"):
ot_metrics.record_auth_failure(reason=_status_to_code(exc.status_code))
should_sanitize = exc.status_code == 500 should_sanitize = exc.status_code == 500
# Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."}) # Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
@ -213,6 +218,7 @@ def _validation_error_handler(
def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse: def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
"""Catch-all: log full traceback, return sanitized 500.""" """Catch-all: log full traceback, return sanitized 500."""
rid = _get_request_id(request) rid = _get_request_id(request)
ot_metrics.record_auth_failure(reason="unhandled_exception")
_error_logger.error( _error_logger.error(
"[%s] Unhandled exception on %s %s", "[%s] Unhandled exception on %s %s",
rid, rid,
@ -246,6 +252,7 @@ def _status_to_code(status_code: int, detail: str = "") -> str:
def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded): def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
"""Custom 429 handler that returns JSON matching our error envelope.""" """Custom 429 handler that returns JSON matching our error envelope."""
rid = _get_request_id(request) rid = _get_request_id(request)
ot_metrics.record_rate_limit_rejection(scope="slowapi")
retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60" retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
return _build_error_response( return _build_error_response(
429, 429,
@ -306,6 +313,7 @@ def _check_rate_limit_memory(
f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} " f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} "
f"({len(timestamps)}/{max_requests} in {window_seconds}s)" f"({len(timestamps)}/{max_requests} in {window_seconds}s)"
) )
ot_metrics.record_rate_limit_rejection(scope=scope)
raise HTTPException( raise HTTPException(
status_code=429, status_code=429,
detail="RATE_LIMIT_EXCEEDED", detail="RATE_LIMIT_EXCEEDED",
@ -349,6 +357,7 @@ def _check_rate_limit(
f"Rate limit exceeded on {scope} for IP {client_ip} " f"Rate limit exceeded on {scope} for IP {client_ip} "
f"({current_count}/{max_requests} in {window_seconds}s)" f"({current_count}/{max_requests} in {window_seconds}s)"
) )
ot_metrics.record_rate_limit_rejection(scope=scope)
raise HTTPException( raise HTTPException(
status_code=429, status_code=429,
detail="RATE_LIMIT_EXCEEDED", detail="RATE_LIMIT_EXCEEDED",
@ -558,6 +567,7 @@ async def lifespan(app: FastAPI):
gc.set_threshold(700, 10, 5) gc.set_threshold(700, 10, 5)
_enable_slow_callback_logging(threshold_sec=0.5) _enable_slow_callback_logging(threshold_sec=0.5)
init_otel(app)
await create_db_and_tables() await create_db_and_tables()
await setup_checkpointer_tables() await setup_checkpointer_tables()
initialize_openrouter_integration() initialize_openrouter_integration()
@ -592,6 +602,7 @@ async def lifespan(app: FastAPI):
_stop_openrouter_background_refresh() _stop_openrouter_background_refresh()
await close_checkpointer() await close_checkpointer()
shutdown_otel()
def registration_allowed(): def registration_allowed():
@ -676,32 +687,20 @@ class RequestPerfMiddleware(BaseHTTPMiddleware):
async def dispatch( async def dispatch(
self, request: StarletteRequest, call_next: RequestResponseEndpoint self, request: StarletteRequest, call_next: RequestResponseEndpoint
) -> StarletteResponse: ) -> StarletteResponse:
perf = get_perf_logger()
t0 = time.perf_counter() t0 = time.perf_counter()
response = await call_next(request) response = await call_next(request)
elapsed_ms = (time.perf_counter() - t0) * 1000 elapsed_ms = (time.perf_counter() - t0) * 1000
path = request.url.path path = request.url.path
method = request.method
status = response.status_code
perf.debug(
"[request] %s %s -> %d in %.1fms",
method,
path,
status,
elapsed_ms,
)
if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD: if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD:
perf.warning( with contextlib.suppress(Exception):
"[SLOW_REQUEST] %s %s -> %d in %.1fms (threshold=%.0fms)", from opentelemetry import trace
method,
path, span = trace.get_current_span()
status, span.set_attribute("slow_request", True)
elapsed_ms, span.set_attribute("surfsense.request.elapsed_ms", elapsed_ms)
_PERF_SLOW_REQUEST_THRESHOLD, span.set_attribute("http.route", path)
)
log_system_snapshot("slow_request") log_system_snapshot("slow_request")
return response return response

View file

@ -1,16 +1,103 @@
"""Celery application configuration and setup.""" """Celery application configuration and setup."""
import contextlib
import os import os
import time
from celery import Celery from celery import Celery
from celery.schedules import crontab from celery.schedules import crontab
from celery.signals import worker_process_init from celery.signals import (
before_task_publish,
task_postrun,
task_prerun,
worker_process_init,
)
from dotenv import load_dotenv from dotenv import load_dotenv
try:
from opentelemetry import trace
except ImportError: # pragma: no cover - optional OTel dependency
trace = None # type: ignore[assignment]
# Load environment variables # Load environment variables
load_dotenv() load_dotenv()
@before_task_publish.connect
def _stamp_enqueue_time(headers=None, **_kwargs):
"""Stamp enqueue time so workers can measure queue wait."""
if headers is None:
return
with contextlib.suppress(Exception):
headers["surfsense.enqueued_at_ns"] = str(time.monotonic_ns())
@task_prerun.connect
def _record_queue_latency(task=None, **_kwargs):
"""Record queue latency and stash task metadata for span enrichment."""
if task is None:
return
try:
from app.observability import metrics as ot_metrics
task_name = getattr(task, "name", None) or "unknown"
operation = ot_metrics.parse_celery_task_label(task_name)
request = getattr(task, "request", None)
delivery_info = getattr(request, "delivery_info", None) or {}
queue = delivery_info.get("routing_key") or "unknown"
scheduled = bool(
getattr(request, "eta", None) or getattr(request, "expires", None)
)
with contextlib.suppress(Exception):
request.surfsense_operation = operation
request.surfsense_queue = queue
request.surfsense_scheduled = scheduled
headers = getattr(request, "headers", None) or {}
enqueued_ns = headers.get("surfsense.enqueued_at_ns")
if enqueued_ns is None:
return
elapsed_s = (time.monotonic_ns() - int(enqueued_ns)) / 1e9
with contextlib.suppress(Exception):
request.surfsense_queue_latency_ms = elapsed_s * 1000
ot_metrics.record_celery_queue_latency(
elapsed_s,
task_name=task_name,
queue=queue,
scheduled=scheduled,
operation=operation,
)
except Exception:
pass
@task_postrun.connect
def _set_celery_span_attributes(task=None, **_kwargs):
"""Attach derived queue metadata to the active Celery run span."""
if task is None or trace is None:
return
try:
request = getattr(task, "request", None)
if request is None:
return
span = trace.get_current_span()
operation = getattr(request, "surfsense_operation", None)
if operation:
span.set_attribute("celery.task.operation", operation)
latency_ms = getattr(request, "surfsense_queue_latency_ms", None)
if latency_ms is not None:
span.set_attribute("celery.queue.latency_ms", latency_ms)
except Exception:
pass
@worker_process_init.connect @worker_process_init.connect
def init_worker(**kwargs): def init_worker(**kwargs):
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts. """Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
@ -18,6 +105,10 @@ def init_worker(**kwargs):
This ensures the Auto mode (LiteLLM Router) is available for background tasks This ensures the Auto mode (LiteLLM Router) is available for background tasks
like document summarization and image generation. like document summarization and image generation.
""" """
from app.observability.bootstrap import init_otel
init_otel(app=None, traces=True, metrics=True, logs=True)
from app.config import ( from app.config import (
initialize_image_gen_router, initialize_image_gen_router,
initialize_llm_router, initialize_llm_router,

View file

@ -1,4 +1,7 @@
import contextlib
import logging import logging
import time
from pathlib import PurePosixPath
from app.config import config as app_config from app.config import config as app_config
from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.etl_document import EtlRequest, EtlResult
@ -10,6 +13,11 @@ from app.etl_pipeline.file_classifier import FileCategory, classify_file
from app.etl_pipeline.parsers.audio import transcribe_audio from app.etl_pipeline.parsers.audio import transcribe_audio
from app.etl_pipeline.parsers.direct_convert import convert_file_directly from app.etl_pipeline.parsers.direct_convert import convert_file_directly
from app.etl_pipeline.parsers.plaintext import read_plaintext from app.etl_pipeline.parsers.plaintext import read_plaintext
from app.observability import metrics as ot_metrics, otel as ot
def _file_extension(filename: str) -> str:
return PurePosixPath(filename).suffix.lower() or "none"
class EtlPipelineService: class EtlPipelineService:
@ -20,7 +28,16 @@ class EtlPipelineService:
async def extract(self, request: EtlRequest) -> EtlResult: async def extract(self, request: EtlRequest) -> EtlResult:
category = classify_file(request.filename) category = classify_file(request.filename)
start = time.perf_counter()
status = "success"
error_category: str | None = None
result: EtlResult | None = None
with ot.etl_extract_span(
content_type=category.value,
file_extension=_file_extension(request.filename),
processing_mode=request.processing_mode.value,
) as sp:
try:
if category == FileCategory.UNSUPPORTED: if category == FileCategory.UNSUPPORTED:
raise EtlUnsupportedFileError( raise EtlUnsupportedFileError(
f"File type not supported for parsing: {request.filename}" f"File type not supported for parsing: {request.filename}"
@ -28,41 +45,74 @@ class EtlPipelineService:
if category == FileCategory.PLAINTEXT: if category == FileCategory.PLAINTEXT:
content = read_plaintext(request.file_path) content = read_plaintext(request.file_path)
return EtlResult( result = EtlResult(
markdown_content=content, markdown_content=content,
etl_service="PLAINTEXT", etl_service="PLAINTEXT",
content_type="plaintext", content_type="plaintext",
) )
return result
if category == FileCategory.DIRECT_CONVERT: if category == FileCategory.DIRECT_CONVERT:
content = convert_file_directly(request.file_path, request.filename) content = convert_file_directly(request.file_path, request.filename)
return EtlResult( result = EtlResult(
markdown_content=content, markdown_content=content,
etl_service="DIRECT_CONVERT", etl_service="DIRECT_CONVERT",
content_type="direct_convert", content_type="direct_convert",
) )
return result
if category == FileCategory.AUDIO: if category == FileCategory.AUDIO:
content = await transcribe_audio(request.file_path, request.filename) content = await transcribe_audio(request.file_path, request.filename)
return EtlResult( result = EtlResult(
markdown_content=content, markdown_content=content,
etl_service="AUDIO", etl_service="AUDIO",
content_type="audio", content_type="audio",
) )
return result
if category == FileCategory.IMAGE: if category == FileCategory.IMAGE:
return await self._extract_image(request) result = await self._extract_image(request)
return result
return await self._extract_document(request) result = await self._extract_document(request)
return result
except Exception as exc:
status = "error"
error_category = ot_metrics.categorize_exception(exc)
raise
finally:
with contextlib.suppress(Exception):
if result is not None:
sp.set_attribute("etl.service", result.etl_service)
sp.set_attribute("content.type", result.content_type)
sp.set_attribute("etl.status", status)
ot_metrics.record_etl_extract_duration(
time.perf_counter() - start,
etl_service=result.etl_service if result else None,
content_type=result.content_type if result else category.value,
status=status,
)
ot_metrics.record_etl_extract_outcome(
etl_service=result.etl_service if result else None,
content_type=result.content_type if result else category.value,
status=status,
error_category=error_category,
)
async def _extract_image(self, request: EtlRequest) -> EtlResult: async def _extract_image(self, request: EtlRequest) -> EtlResult:
if self._vision_llm: if self._vision_llm:
try: try:
from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
with ot.etl_parse_span(
etl_service="VISION_LLM",
content_type="image",
file_extension=_file_extension(request.filename),
) as sp:
content = await parse_with_vision_llm( content = await parse_with_vision_llm(
request.file_path, request.filename, self._vision_llm request.file_path, request.filename, self._vision_llm
) )
sp.set_attribute("etl.status", "success")
return EtlResult( return EtlResult(
markdown_content=content, markdown_content=content,
etl_service="VISION_LLM", etl_service="VISION_LLM",
@ -87,13 +137,33 @@ class EtlPipelineService:
request.filename, request.filename,
exc_info=True, exc_info=True,
) )
ot.add_event(
"etl.fallback",
{
"fallback.from": "vision_llm",
"fallback.to": "document_parser",
"fallback.reason": ot_metrics.categorize_exception(exc),
},
)
else: else:
logging.info( logging.info(
"No vision LLM provided, falling back to document parser for %s", "No vision LLM provided, falling back to document parser for %s",
request.filename, request.filename,
) )
ot.add_event(
"etl.fallback",
{
"fallback.from": "vision_llm",
"fallback.to": "document_parser",
"fallback.reason": "not_configured",
},
)
try: try:
with ot.etl_ocr_span(
etl_service=app_config.ETL_SERVICE,
file_extension=_file_extension(request.filename),
):
return await self._extract_document(request) return await self._extract_document(request)
except (EtlUnsupportedFileError, EtlServiceUnavailableError): except (EtlUnsupportedFileError, EtlServiceUnavailableError):
raise EtlUnsupportedFileError( raise EtlUnsupportedFileError(
@ -121,18 +191,27 @@ class EtlPipelineService:
f"File type {ext} is not supported by {etl_service}" f"File type {ext} is not supported by {etl_service}"
) )
with ot.etl_parse_span(
etl_service=etl_service,
content_type="document",
file_extension=ext,
processing_mode=request.processing_mode.value,
) as sp:
if etl_service == "DOCLING": if etl_service == "DOCLING":
from app.etl_pipeline.parsers.docling import parse_with_docling from app.etl_pipeline.parsers.docling import parse_with_docling
content = await parse_with_docling(request.file_path, request.filename) content = await parse_with_docling(request.file_path, request.filename)
elif etl_service == "UNSTRUCTURED": elif etl_service == "UNSTRUCTURED":
from app.etl_pipeline.parsers.unstructured import parse_with_unstructured from app.etl_pipeline.parsers.unstructured import (
parse_with_unstructured,
)
content = await parse_with_unstructured(request.file_path) content = await parse_with_unstructured(request.file_path)
elif etl_service == "LLAMACLOUD": elif etl_service == "LLAMACLOUD":
content = await self._extract_with_llamacloud(request) content = await self._extract_with_llamacloud(request)
else: else:
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
sp.set_attribute("etl.status", "success")
# When the operator opts into vision-LLM at ingest, walk the # When the operator opts into vision-LLM at ingest, walk the
# original file's embedded images and append a structured # original file's embedded images and append a structured
@ -171,9 +250,14 @@ class EtlPipelineService:
async def _ocr_image(image_path: str, image_name: str) -> str: async def _ocr_image(image_path: str, image_name: str) -> str:
try: try:
sub = EtlPipelineService(vision_llm=None) sub = EtlPipelineService(vision_llm=None)
with ot.etl_picture_ocr_span(
file_extension=_file_extension(image_name)
) as sp:
ocr_result = await sub.extract( ocr_result = await sub.extract(
EtlRequest(file_path=image_path, filename=image_name) EtlRequest(file_path=image_path, filename=image_name)
) )
sp.set_attribute("etl.service", ocr_result.etl_service)
sp.set_attribute("etl.status", "success")
except ( except (
EtlUnsupportedFileError, EtlUnsupportedFileError,
EtlServiceUnavailableError, EtlServiceUnavailableError,
@ -181,20 +265,42 @@ class EtlPipelineService:
# Common case: the configured ETL service can't OCR # Common case: the configured ETL service can't OCR
# this image format (or no service is configured at # this image format (or no service is configured at
# all). Don't spam warnings -- just no OCR for it. # all). Don't spam warnings -- just no OCR for it.
ot.add_event(
"etl.ocr.skipped",
{
"skip.reason": "unsupported_format",
"error.category": ot_metrics.categorize_exception(exc),
},
)
logging.debug("Skipping per-image OCR for %s: %s", image_name, exc) logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
return "" return ""
return ocr_result.markdown_content return ocr_result.markdown_content
try: try:
with ot.etl_picture_describe_span() as sp:
result = await describe_pictures( result = await describe_pictures(
request.file_path, request.file_path,
request.filename, request.filename,
self._vision_llm, self._vision_llm,
ocr_runner=_ocr_image, ocr_runner=_ocr_image,
) )
except Exception: sp.set_attribute("image.described.count", len(result.descriptions))
sp.set_attribute("image.failed.count", result.failed)
sp.set_attribute("image.skipped.too_small", result.skipped_too_small)
sp.set_attribute("image.skipped.too_large", result.skipped_too_large)
sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate)
sp.set_attribute("etl.status", "success")
except Exception as exc:
# Picture description is additive; never let it fail an # Picture description is additive; never let it fail an
# otherwise-successful document extraction. # otherwise-successful document extraction.
ot.add_event(
"etl.degraded",
{
"degraded.reason": "picture_describe_failed",
"degraded.action": "return_parser_output",
"error.category": ot_metrics.categorize_exception(exc),
},
)
logging.warning( logging.warning(
"Picture description failed for %s, returning parser output unchanged", "Picture description failed for %s, returning parser output unchanged",
request.filename, request.filename,
@ -247,7 +353,15 @@ class EtlPipelineService:
return await parse_with_azure_doc_intelligence( return await parse_with_azure_doc_intelligence(
request.file_path, processing_mode=mode_value request.file_path, processing_mode=mode_value
) )
except Exception: except Exception as exc:
ot.add_event(
"etl.fallback",
{
"fallback.from": "azure_di",
"fallback.to": "llamacloud",
"fallback.reason": ot_metrics.categorize_exception(exc),
},
)
logging.warning( logging.warning(
"Azure Document Intelligence failed for %s, " "Azure Document Intelligence failed for %s, "
"falling back to LlamaCloud", "falling back to LlamaCloud",

View file

@ -2,6 +2,7 @@ import asyncio
import contextlib import contextlib
import hashlib import hashlib
import logging import logging
import sys
import time import time
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field from dataclasses import dataclass, field
@ -57,6 +58,7 @@ from app.indexing_pipeline.pipeline_logger import (
log_retryable_llm_error, log_retryable_llm_error,
log_unexpected_error, log_unexpected_error,
) )
from app.observability import metrics as ot_metrics, otel as ot
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
@ -362,6 +364,16 @@ class IndexingPipelineService:
) )
perf = get_perf_logger() perf = get_perf_logger()
t_index = time.perf_counter() t_index = time.perf_counter()
document_type = (
document.document_type.value
if getattr(document, "document_type", None)
else None
)
persist_span_cm = ot.kb_persist_span(
document_type=document_type,
)
persist_span = persist_span_cm.__enter__()
outcome_status = "failed"
try: try:
log_index_started(ctx) log_index_started(ctx)
document.status = DocumentStatus.processing() document.status = DocumentStatus.processing()
@ -429,34 +441,41 @@ class IndexingPipelineService:
time.perf_counter() - t_index, time.perf_counter() - t_index,
) )
log_index_success(ctx, chunk_count=len(chunks)) log_index_success(ctx, chunk_count=len(chunks))
outcome_status = "success"
await self._enqueue_ai_sort_if_enabled(document) await self._enqueue_ai_sort_if_enabled(document)
except RETRYABLE_LLM_ERRORS as e: except RETRYABLE_LLM_ERRORS as e:
ot.record_error(persist_span, e)
log_retryable_llm_error(ctx, e) log_retryable_llm_error(ctx, e)
outcome_status = "requeued"
await rollback_and_persist_failure( await rollback_and_persist_failure(
self.session, document, llm_retryable_message(e) self.session, document, llm_retryable_message(e)
) )
except PERMANENT_LLM_ERRORS as e: except PERMANENT_LLM_ERRORS as e:
ot.record_error(persist_span, e)
log_permanent_llm_error(ctx, e) log_permanent_llm_error(ctx, e)
await rollback_and_persist_failure( await rollback_and_persist_failure(
self.session, document, llm_permanent_message(e) self.session, document, llm_permanent_message(e)
) )
except RecursionError as e: except RecursionError as e:
ot.record_error(persist_span, e)
log_chunking_overflow(ctx, e) log_chunking_overflow(ctx, e)
await rollback_and_persist_failure( await rollback_and_persist_failure(
self.session, document, PipelineMessages.CHUNKING_OVERFLOW self.session, document, PipelineMessages.CHUNKING_OVERFLOW
) )
except EMBEDDING_ERRORS as e: except EMBEDDING_ERRORS as e:
ot.record_error(persist_span, e)
log_embedding_error(ctx, e) log_embedding_error(ctx, e)
await rollback_and_persist_failure( await rollback_and_persist_failure(
self.session, document, embedding_message(e) self.session, document, embedding_message(e)
) )
except Exception as e: except Exception as e:
ot.record_error(persist_span, e)
log_unexpected_error(ctx, e) log_unexpected_error(ctx, e)
await rollback_and_persist_failure( await rollback_and_persist_failure(
self.session, document, safe_exception_message(e) self.session, document, safe_exception_message(e)
@ -465,6 +484,17 @@ class IndexingPipelineService:
with contextlib.suppress(Exception): with contextlib.suppress(Exception):
await self.session.refresh(document) await self.session.refresh(document)
with contextlib.suppress(Exception):
persist_span.set_attribute("indexing.status", outcome_status)
ot_metrics.record_indexing_document_duration(
time.perf_counter() - t_index,
document_type=document_type,
)
ot_metrics.record_indexing_document_outcome(
document_type=document_type,
status=outcome_status,
)
persist_span_cm.__exit__(*sys.exc_info())
return document return document
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:

View file

@ -5,3 +5,5 @@ small wrapper around the optional ``opentelemetry`` instrumentation. The
wrapper is a no-op when OTEL is not configured, so importing it from wrapper is a no-op when OTEL is not configured, so importing it from
performance-critical paths is safe. performance-critical paths is safe.
""" """
__all__ = ["bootstrap", "metrics", "otel"]

View file

@ -0,0 +1,390 @@
"""Programmatic OpenTelemetry bootstrap for SurfSense backend processes."""
from __future__ import annotations
import contextlib
import logging
import os
import socket
from importlib import metadata
from typing import Any
from urllib.parse import urlsplit, urlunsplit
from app.observability import otel
logger = logging.getLogger(__name__)
_BOOL_TRUE = {"1", "true", "yes", "on"}
_TRACES_INITIALIZED = False
_METRICS_INITIALIZED = False
_LOGS_INITIALIZED = False
_FASTAPI_INSTRUMENTED = False
_SQLALCHEMY_INSTRUMENTED = False
_PSYCOPG_INSTRUMENTED = False
_REDIS_INSTRUMENTED = False
_HTTPX_INSTRUMENTED = False
_CELERY_INSTRUMENTED = False
_TRACER_PROVIDER: Any | None = None
_METER_PROVIDER: Any | None = None
def _env_truthy(name: str) -> bool:
return os.environ.get(name, "").strip().lower() in _BOOL_TRUE
def is_otel_disabled() -> bool:
"""Return true when either SurfSense or OTel's spec kill switch is set."""
return _env_truthy("SURFSENSE_DISABLE_OTEL") or _env_truthy("OTEL_SDK_DISABLED")
def is_otel_configured() -> bool:
"""Return true when this process should export OTel signals."""
return bool(
os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
or os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
or os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
)
def _package_version() -> str:
with contextlib.suppress(metadata.PackageNotFoundError):
return metadata.version("surf-new-backend")
return "unknown"
def _deployment_environment() -> str:
return os.environ.get("SURFSENSE_ENV", "dev")
def _build_resource():
from opentelemetry.sdk.resources import Resource
deployment_environment = _deployment_environment()
return Resource.create(
{
"service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
"service.version": _package_version(),
"service.instance.id": socket.gethostname(),
"deployment.environment.name": deployment_environment,
# Compatibility alias for Grafana onboarding checks that still use
# the older semantic-convention key.
"deployment.environment": deployment_environment,
}
)
def _otlp_protocol() -> str:
return os.environ.get("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc").strip().lower()
def _trace_exporter():
if _otlp_protocol() == "http/protobuf":
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
OTLPSpanExporter,
)
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter()
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter()
def _metric_exporter():
if _otlp_protocol() == "http/protobuf":
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
OTLPMetricExporter,
)
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
return (
OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter()
)
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
OTLPMetricExporter,
)
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
return OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter()
def _safe_instrument(name: str, instrument: Any) -> bool:
try:
instrument()
except Exception:
logger.warning("OpenTelemetry %s instrumentation failed", name, exc_info=True)
return False
return True
def _url_without_query(raw_url: Any) -> str | None:
try:
parts = urlsplit(str(raw_url))
except Exception:
return None
if not parts.scheme or not parts.netloc:
return None
return urlunsplit((parts.scheme, parts.netloc, parts.path or "/", "", ""))
def _sanitize_http_span_url(span: Any, request: Any) -> None:
sanitized = _url_without_query(getattr(request, "url", None))
if not sanitized:
return
with contextlib.suppress(Exception):
# Keep both old and current semantic-convention names safe. The
# collector can drop one later without needing application changes.
span.set_attribute("http.url", sanitized)
span.set_attribute("url.full", sanitized)
def _instrument_fastapi(app: Any | None) -> None:
global _FASTAPI_INSTRUMENTED
if app is None or _FASTAPI_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
FastAPIInstrumentor.instrument_app(
app,
excluded_urls="/health,/ready,/metrics",
)
if _safe_instrument("FastAPI", _run):
_FASTAPI_INSTRUMENTED = True
def instrument_sqlalchemy_engine(engine: Any) -> None:
"""Instrument a SQLAlchemy engine once per process."""
global _SQLALCHEMY_INSTRUMENTED
if _SQLALCHEMY_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
SQLAlchemyInstrumentor().instrument(
engine=getattr(engine, "sync_engine", engine),
enable_commenter=True,
)
if _safe_instrument("SQLAlchemy", _run):
_SQLALCHEMY_INSTRUMENTED = True
def _instrument_sqlalchemy() -> None:
if _SQLALCHEMY_INSTRUMENTED:
return
with contextlib.suppress(Exception):
from app.db import engine
instrument_sqlalchemy_engine(engine)
def _instrument_psycopg() -> None:
global _PSYCOPG_INSTRUMENTED
if _PSYCOPG_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.psycopg import PsycopgInstrumentor
PsycopgInstrumentor().instrument()
if _safe_instrument("psycopg", _run):
_PSYCOPG_INSTRUMENTED = True
def _instrument_redis() -> None:
global _REDIS_INSTRUMENTED
if _REDIS_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.redis import RedisInstrumentor
RedisInstrumentor().instrument()
if _safe_instrument("Redis", _run):
_REDIS_INSTRUMENTED = True
def _instrument_httpx() -> None:
global _HTTPX_INSTRUMENTED
if _HTTPX_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
HTTPXClientInstrumentor().instrument(
request_hook=lambda span, request: _sanitize_http_span_url(span, request),
response_hook=lambda span, request, _response: _sanitize_http_span_url(
span, request
),
)
if _safe_instrument("HTTPX", _run):
_HTTPX_INSTRUMENTED = True
def instrument_celery() -> None:
"""Instrument Celery producer/consumer hooks once per process."""
global _CELERY_INSTRUMENTED
if _CELERY_INSTRUMENTED:
return
def _run() -> None:
from opentelemetry.instrumentation.celery import CeleryInstrumentor
CeleryInstrumentor().instrument()
if _safe_instrument("Celery", _run):
_CELERY_INSTRUMENTED = True
def _instrument_libraries(app: Any | None) -> None:
_instrument_fastapi(app)
_instrument_sqlalchemy()
_instrument_psycopg()
_instrument_redis()
_instrument_httpx()
instrument_celery()
def init_traces(app: Any | None = None) -> None:
"""Install the tracer provider, span processor, exporter, and instrumentors."""
global _TRACER_PROVIDER, _TRACES_INITIALIZED
if _TRACES_INITIALIZED:
_instrument_fastapi(app)
return
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased
provider = TracerProvider(
resource=_build_resource(),
sampler=ParentBased(ALWAYS_ON),
)
provider.add_span_processor(BatchSpanProcessor(_trace_exporter()))
try:
trace.set_tracer_provider(provider)
except Exception:
logger.warning(
"OpenTelemetry tracer provider was already set; reusing existing provider",
exc_info=True,
)
_TRACER_PROVIDER = trace.get_tracer_provider()
else:
_TRACER_PROVIDER = provider
_TRACES_INITIALIZED = True
otel.reload_for_tests()
_instrument_libraries(app)
def init_metrics() -> None:
"""Install the meter provider, metric reader, exporter, and custom gauges."""
global _METER_PROVIDER, _METRICS_INITIALIZED
if _METRICS_INITIALIZED:
return
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
interval_ms = int(os.environ.get("OTEL_METRIC_EXPORT_INTERVAL", "60000"))
reader = PeriodicExportingMetricReader(
_metric_exporter(),
export_interval_millis=interval_ms,
)
provider = MeterProvider(metric_readers=[reader], resource=_build_resource())
try:
metrics.set_meter_provider(provider)
except Exception:
logger.warning(
"OpenTelemetry meter provider was already set; reusing existing provider",
exc_info=True,
)
_METER_PROVIDER = metrics.get_meter_provider()
else:
_METER_PROVIDER = provider
_METRICS_INITIALIZED = True
from app.observability.metrics import register_runtime_observables
register_runtime_observables()
def init_logs() -> None:
"""Enable trace/span correlation fields on stdlib LogRecords."""
global _LOGS_INITIALIZED
if _LOGS_INITIALIZED:
return
def _run() -> None:
from opentelemetry.instrumentation.logging import LoggingInstrumentor
# Required for stdlib LogRecords to receive otelTraceID/otelSpanID.
# logging.basicConfig is already installed by main.py, so this does not
# take over formatting in normal app startup.
LoggingInstrumentor().instrument(set_logging_format=True)
if _safe_instrument("logging", _run):
_LOGS_INITIALIZED = True
def init_otel(
app: Any | None = None,
*,
traces: bool = True,
metrics: bool = True,
logs: bool = True,
) -> None:
"""Initialize OpenTelemetry for a FastAPI or Celery process."""
if is_otel_disabled() or not is_otel_configured():
otel.reload_for_tests()
return
if traces:
init_traces(app)
if metrics:
init_metrics()
if logs:
init_logs()
def shutdown_otel(timeout_millis: int = 5000) -> None:
"""Best-effort flush and shutdown for installed providers."""
for provider in (_TRACER_PROVIDER, _METER_PROVIDER):
if provider is None:
continue
with contextlib.suppress(Exception):
provider.force_flush(timeout_millis=timeout_millis)
with contextlib.suppress(Exception):
provider.shutdown()
__all__ = [
"_BOOL_TRUE",
"_build_resource",
"init_logs",
"init_metrics",
"init_otel",
"init_traces",
"instrument_celery",
"instrument_sqlalchemy_engine",
"is_otel_configured",
"is_otel_disabled",
"shutdown_otel",
]

View file

@ -0,0 +1,684 @@
"""Custom OpenTelemetry metrics for SurfSense.
This module owns all SurfSense-specific metric instruments. Callers use the
small helper functions below instead of constructing instruments directly so
attribute names and cardinality stay consistent across the backend.
"""
from __future__ import annotations
import contextlib
import gc
import logging
from functools import lru_cache
from importlib import metadata
from typing import Any
from app.observability import otel
logger = logging.getLogger(__name__)
_INSTRUMENTATION_NAME = "surfsense.platform"
_OBSERVABLES_REGISTERED = False
_ERROR_CATEGORY_UNKNOWN = "unknown"
_ERROR_CATEGORY_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = (
("rate_limited", ("ratelimit", "rate_limit", "toomanyrequests", "429")),
("auth_failed", ("authentication", "auth", "unauthorized", "forbidden")),
("quota_exhausted", ("quota", "insufficient", "credit", "billing")),
("timeout", ("timeout", "timedout", "deadline")),
("network_failed", ("connection", "connect", "network", "dns", "socket")),
("server_error", ("internalserver", "serviceunavailable", "badgateway", "gateway")),
("lock_contention", ("lock", "busy", "contention", "alreadyrunning")),
("unsupported_format", ("unsupported", "format", "filetype")),
("provider_error", ("provider", "apierror", "apistatus", "badrequest")),
)
def _package_version() -> str:
with contextlib.suppress(metadata.PackageNotFoundError):
return metadata.version("surf-new-backend")
return "unknown"
def _is_enabled() -> bool:
return otel.is_enabled()
def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
"""Drop empty values and coerce low-cardinality attrs to OTel-safe scalars."""
cleaned: dict[str, str | int | float | bool] = {}
for key, value in attrs.items():
if value is None:
continue
if isinstance(value, bool | int | float):
cleaned[key] = value
continue
text = str(value)
if text:
cleaned[key] = text
return cleaned
def _attrs_with_optional_error_category(
attrs: dict[str, Any], error_category: str | None
) -> dict[str, Any]:
if error_category:
return {**attrs, "error.category": error_category}
return attrs
def categorize_exception(exc: BaseException | None) -> str:
"""Return a low-cardinality category for an exception."""
if exc is None:
return _ERROR_CATEGORY_UNKNOWN
haystack = " ".join(
cls.__name__.replace("-", "").replace("_", "").lower()
for cls in type(exc).__mro__
)
for category, hints in _ERROR_CATEGORY_HINTS:
if any(hint in haystack for hint in hints):
return category
return _ERROR_CATEGORY_UNKNOWN
def parse_celery_task_label(task_name: str | None) -> str:
"""Return the operation token from a Celery task name."""
if not task_name:
return "unknown"
operation = str(task_name).split("_", 1)[0].strip()
return operation or "unknown"
def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None:
if not _is_enabled():
return
with contextlib.suppress(Exception):
callable_obj.record(value, _clean_attrs(attrs))
def _add(callable_obj: Any, value: int, attrs: dict[str, Any]) -> None:
if not _is_enabled():
return
with contextlib.suppress(Exception):
callable_obj.add(value, _clean_attrs(attrs))
@lru_cache(maxsize=1)
def _get_meter():
from opentelemetry import metrics
return metrics.get_meter(_INSTRUMENTATION_NAME, _package_version())
@lru_cache(maxsize=1)
def _model_call_duration():
return _get_meter().create_histogram(
"surfsense.model.call.duration",
unit="ms",
description="Duration of SurfSense LLM model calls.",
)
@lru_cache(maxsize=1)
def _model_token_usage():
return _get_meter().create_histogram(
"gen_ai.client.token.usage",
unit="{token}",
description="Token usage reported by GenAI model responses.",
)
@lru_cache(maxsize=1)
def _tool_call_duration():
return _get_meter().create_histogram(
"surfsense.tool.call.duration",
unit="ms",
description="Duration of SurfSense agent tool calls.",
)
@lru_cache(maxsize=1)
def _tool_call_errors():
return _get_meter().create_counter(
"surfsense.tool.call.errors",
description="Count of SurfSense agent tool call errors.",
)
@lru_cache(maxsize=1)
def _kb_search_duration():
return _get_meter().create_histogram(
"surfsense.kb.search.duration",
unit="ms",
description="Duration of SurfSense knowledge-base search calls.",
)
@lru_cache(maxsize=1)
def _compaction_runs():
return _get_meter().create_counter(
"surfsense.compaction.runs",
description="Count of SurfSense conversation compaction runs.",
)
@lru_cache(maxsize=1)
def _permission_asks():
return _get_meter().create_counter(
"surfsense.permission.asks",
description="Count of SurfSense permission asks.",
)
@lru_cache(maxsize=1)
def _interrupts():
return _get_meter().create_counter(
"surfsense.interrupt.raised",
description="Count of SurfSense interrupts raised.",
)
@lru_cache(maxsize=1)
def _indexing_document_duration():
return _get_meter().create_histogram(
"surfsense.indexing.document.duration",
unit="s",
description="Duration of SurfSense document indexing.",
)
@lru_cache(maxsize=1)
def _indexing_document_outcome():
return _get_meter().create_counter(
"surfsense.indexing.document.outcome",
description="Count of SurfSense document indexing outcomes.",
)
@lru_cache(maxsize=1)
def _connector_sync_duration():
return _get_meter().create_histogram(
"surfsense.connector.sync.duration",
unit="s",
description="Duration of SurfSense connector sync tasks.",
)
@lru_cache(maxsize=1)
def _connector_sync_outcome():
return _get_meter().create_counter(
"surfsense.connector.sync.outcome",
description="Count of SurfSense connector sync outcomes.",
)
@lru_cache(maxsize=1)
def _auth_failures():
return _get_meter().create_counter(
"surfsense.auth.failures",
description="Count of SurfSense authentication failures.",
)
@lru_cache(maxsize=1)
def _rate_limit_rejections():
return _get_meter().create_counter(
"surfsense.rate_limit.rejections",
description="Count of SurfSense rate-limit rejections.",
)
@lru_cache(maxsize=1)
def _perf_elapsed():
return _get_meter().create_histogram(
"surfsense.perf.elapsed_ms",
unit="ms",
description="Elapsed time recorded by SurfSense perf timers.",
)
@lru_cache(maxsize=1)
def _chat_request_duration():
return _get_meter().create_histogram(
"surfsense.chat.request.duration",
unit="ms",
description="Duration of SurfSense streamed chat requests.",
)
@lru_cache(maxsize=1)
def _chat_request_outcome():
return _get_meter().create_counter(
"surfsense.chat.request.outcome",
description="Count of SurfSense chat request outcomes.",
)
@lru_cache(maxsize=1)
def _subagent_invoke_duration():
return _get_meter().create_histogram(
"surfsense.subagent.invoke.duration",
unit="ms",
description="Duration of SurfSense subagent invocations.",
)
@lru_cache(maxsize=1)
def _subagent_invoke_outcome():
return _get_meter().create_counter(
"surfsense.subagent.invoke.outcome",
description="Count of SurfSense subagent invocation outcomes.",
)
@lru_cache(maxsize=1)
def _etl_extract_duration():
return _get_meter().create_histogram(
"surfsense.etl.extract.duration",
unit="s",
description="Duration of SurfSense ETL extraction.",
)
@lru_cache(maxsize=1)
def _etl_extract_outcome():
return _get_meter().create_counter(
"surfsense.etl.extract.outcome",
description="Count of SurfSense ETL extraction outcomes.",
)
@lru_cache(maxsize=1)
def _celery_heartbeat_refreshes():
return _get_meter().create_counter(
"surfsense.celery.heartbeat.refreshes",
description="Count of SurfSense Celery heartbeat refreshes.",
)
@lru_cache(maxsize=1)
def _celery_heartbeat_failures():
return _get_meter().create_counter(
"surfsense.celery.heartbeat.failures",
description="Count of SurfSense Celery heartbeat failures.",
)
@lru_cache(maxsize=1)
def _celery_queue_latency():
return _get_meter().create_histogram(
"surfsense.celery.queue.latency",
unit="s",
description="Time SurfSense Celery tasks spend waiting in queue.",
)
def record_model_call_duration(
duration_ms: float, *, model: str | None, provider: str | None
) -> None:
_record(
_model_call_duration(),
duration_ms,
{
"gen_ai.request.model": model,
"gen_ai.provider.name": provider,
},
)
def record_model_token_usage(
*,
input_tokens: int | None,
output_tokens: int | None,
model: str | None,
provider: str | None,
) -> None:
base = {
"gen_ai.request.model": model,
"gen_ai.provider.name": provider,
"gen_ai.operation.name": "chat",
}
if input_tokens is not None:
_record(
_model_token_usage(),
int(input_tokens),
{**base, "gen_ai.token.type": "input"},
)
if output_tokens is not None:
_record(
_model_token_usage(),
int(output_tokens),
{**base, "gen_ai.token.type": "output"},
)
def record_tool_call_duration(duration_ms: float, *, tool_name: str) -> None:
_record(_tool_call_duration(), duration_ms, {"tool.name": tool_name})
def record_tool_call_error(*, tool_name: str) -> None:
_add(_tool_call_errors(), 1, {"tool.name": tool_name})
def record_kb_search_duration(
duration_ms: float, *, search_space_id: int | None, surface: str
) -> None:
_record(
_kb_search_duration(),
duration_ms,
{"search_space.id": search_space_id, "search.surface": surface},
)
def record_compaction_run(*, reason: str | None) -> None:
_add(_compaction_runs(), 1, {"compaction.reason": reason or "unknown"})
def record_permission_ask(*, permission: str) -> None:
_add(_permission_asks(), 1, {"permission.permission": permission})
def record_interrupt(*, interrupt_type: str) -> None:
_add(_interrupts(), 1, {"interrupt.type": interrupt_type})
def record_indexing_document_duration(
duration_s: float, *, document_type: str | None
) -> None:
_record(
_indexing_document_duration(),
duration_s,
{"document.type": document_type or "unknown"},
)
def record_indexing_document_outcome(*, document_type: str | None, status: str) -> None:
_add(
_indexing_document_outcome(),
1,
{"document.type": document_type or "unknown", "status": status},
)
def record_connector_sync_duration(
duration_s: float, *, connector_type: str | None
) -> None:
_record(
_connector_sync_duration(),
duration_s,
{"connector.type": connector_type or "unknown"},
)
def record_connector_sync_outcome(
*, connector_type: str | None, status: str, error_category: str | None = None
) -> None:
_add(
_connector_sync_outcome(),
1,
_attrs_with_optional_error_category(
{"connector.type": connector_type or "unknown", "status": status},
error_category,
),
)
def record_auth_failure(*, reason: str) -> None:
_add(_auth_failures(), 1, {"reason": reason})
def record_rate_limit_rejection(*, scope: str) -> None:
_add(_rate_limit_rejections(), 1, {"scope": scope})
def record_perf_elapsed(duration_ms: float, *, label: str) -> None:
_record(_perf_elapsed(), duration_ms, {"label": label})
def record_chat_request_duration(
duration_ms: float,
*,
flow: str,
outcome: str,
agent_mode: str | None = None,
) -> None:
_record(
_chat_request_duration(),
duration_ms,
{"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
)
def record_chat_request_outcome(
*,
flow: str,
outcome: str,
agent_mode: str | None = None,
error_category: str | None = None,
) -> None:
_add(
_chat_request_outcome(),
1,
_attrs_with_optional_error_category(
{"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode},
error_category,
),
)
def record_subagent_invoke_duration(
duration_ms: float,
*,
subagent_type: str,
path: str | None,
outcome: str,
) -> None:
_record(
_subagent_invoke_duration(),
duration_ms,
{
"subagent.type": subagent_type,
"subagent.path": path or "unknown",
"outcome": outcome,
},
)
def record_subagent_invoke_outcome(
*,
subagent_type: str,
path: str | None,
outcome: str,
) -> None:
_add(
_subagent_invoke_outcome(),
1,
{
"subagent.type": subagent_type,
"subagent.path": path or "unknown",
"outcome": outcome,
},
)
def record_etl_extract_duration(
duration_s: float,
*,
etl_service: str | None,
content_type: str | None,
status: str,
) -> None:
_record(
_etl_extract_duration(),
duration_s,
{
"etl.service": etl_service or "unknown",
"content.type": content_type or "unknown",
"status": status,
},
)
def record_etl_extract_outcome(
*,
etl_service: str | None,
content_type: str | None,
status: str,
error_category: str | None = None,
) -> None:
_add(
_etl_extract_outcome(),
1,
_attrs_with_optional_error_category(
{
"etl.service": etl_service or "unknown",
"content.type": content_type or "unknown",
"status": status,
},
error_category,
),
)
def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None:
_add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type})
def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None:
_add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type})
def record_celery_queue_latency(
duration_s: float,
*,
task_name: str | None,
queue: str | None,
scheduled: bool,
operation: str | None,
) -> None:
_record(
_celery_queue_latency(),
duration_s,
{
"task.name": task_name or "unknown",
"task.queue": queue or "unknown",
"task.scheduled": bool(scheduled),
"operation": operation or "unknown",
},
)
def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]:
from opentelemetry.metrics import Observation
from app.utils.perf import system_snapshot
snap = system_snapshot()
value = snap.get(key)
if not isinstance(value, int | float) or value < 0:
return []
if transform is not None:
value = transform(value)
return [Observation(value)]
def _observe_gc_collections(_options: Any) -> list[Any]:
from opentelemetry.metrics import Observation
return [
Observation(count, {"generation": str(generation)})
for generation, count in enumerate(gc.get_count())
]
def register_runtime_observables() -> None:
"""Register process/runtime observable gauges once per process."""
global _OBSERVABLES_REGISTERED
if _OBSERVABLES_REGISTERED or not _is_enabled():
return
meter = _get_meter()
try:
# Each callback returns the value for a single gauge except GC, whose
# callback carries a generation attribute.
meter.create_observable_gauge(
"process.runtime.cpython.memory.rss",
callbacks=[
lambda _options: _runtime_snapshot_value(
"rss_mb", lambda v: float(v) * 1024 * 1024
)
],
unit="By",
description="Resident set size of the SurfSense backend process.",
)
meter.create_observable_gauge(
"process.runtime.cpython.cpu.utilization",
callbacks=[
lambda _options: _runtime_snapshot_value(
"cpu_percent", lambda v: float(v) / 100.0
)
],
unit="1",
description="CPU utilization of the SurfSense backend process.",
)
meter.create_observable_gauge(
"process.runtime.cpython.threads",
callbacks=[lambda _options: _runtime_snapshot_value("threads")],
unit="{thread}",
description="Thread count of the SurfSense backend process.",
)
meter.create_observable_gauge(
"process.runtime.cpython.open_fds",
callbacks=[lambda _options: _runtime_snapshot_value("open_fds")],
unit="{fd}",
description="Open file descriptor count of the SurfSense backend process.",
)
meter.create_observable_gauge(
"python.asyncio.tasks",
callbacks=[lambda _options: _runtime_snapshot_value("asyncio_tasks")],
unit="{task}",
description="Live asyncio task count in the current event loop.",
)
meter.create_observable_gauge(
"process.runtime.cpython.gc.collections",
callbacks=[_observe_gc_collections],
unit="{collection}",
description="CPython GC counters by generation.",
)
except Exception:
logger.warning("Failed to register OTel runtime observables", exc_info=True)
return
_OBSERVABLES_REGISTERED = True
__all__ = [
"categorize_exception",
"parse_celery_task_label",
"record_auth_failure",
"record_celery_heartbeat_failure",
"record_celery_heartbeat_refresh",
"record_celery_queue_latency",
"record_chat_request_duration",
"record_chat_request_outcome",
"record_compaction_run",
"record_connector_sync_duration",
"record_connector_sync_outcome",
"record_etl_extract_duration",
"record_etl_extract_outcome",
"record_indexing_document_duration",
"record_indexing_document_outcome",
"record_interrupt",
"record_kb_search_duration",
"record_model_call_duration",
"record_model_token_usage",
"record_perf_elapsed",
"record_permission_ask",
"record_rate_limit_rejection",
"record_subagent_invoke_duration",
"record_subagent_invoke_outcome",
"record_tool_call_duration",
"record_tool_call_error",
"register_runtime_observables",
]

View file

@ -66,6 +66,8 @@ def _resolve_enabled() -> bool:
# Honor an explicit kill-switch first. # Honor an explicit kill-switch first.
if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}: if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}:
return False return False
if os.environ.get("OTEL_SDK_DISABLED", "").lower() in {"1", "true", "yes", "on"}:
return False
# Treat a configured endpoint as the canonical "OTel is wired up" signal. # Treat a configured endpoint as the canonical "OTel is wired up" signal.
if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"): if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
return True return True
@ -90,6 +92,48 @@ def is_enabled() -> bool:
return _ENABLED return _ENABLED
def _clean_event_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
"""Coerce event attributes to OTel-safe scalar values."""
cleaned: dict[str, str | int | float | bool] = {}
for key, value in attrs.items():
if value is None:
continue
if isinstance(value, bool | int | float):
cleaned[key] = value
continue
text = str(value)
if text:
cleaned[key] = text
return cleaned
def add_event(name: str, attributes: dict[str, Any] | None = None) -> None:
"""Attach an event to the current active span.
This is intentionally no-op and exception-safe when OTel is disabled,
unavailable, or no span is currently recording.
"""
if not _ENABLED or _ot_trace is None:
return
with contextlib.suppress(Exception):
sp = _ot_trace.get_current_span()
if sp is None or not sp.is_recording():
return
sp.add_event(
name,
attributes=_clean_event_attrs(attributes) if attributes else None,
)
def record_error(span_obj: Any, exc: BaseException) -> None:
"""Record an exception and mark a span as errored without re-raising."""
if not _ENABLED:
return
with contextlib.suppress(Exception):
span_obj.record_exception(exc)
span_obj.set_status(_OtStatus(_OtStatusCode.ERROR, str(exc)))
def _get_tracer(): def _get_tracer():
if not _OTEL_AVAILABLE: if not _OTEL_AVAILABLE:
return None return None
@ -198,8 +242,11 @@ def model_call_span(
attrs: dict[str, Any] = {} attrs: dict[str, Any] = {}
if model_id: if model_id:
attrs["model.id"] = model_id attrs["model.id"] = model_id
attrs["gen_ai.request.model"] = model_id
if provider: if provider:
attrs["model.provider"] = provider attrs["model.provider"] = provider
attrs["gen_ai.provider.name"] = provider
attrs["gen_ai.operation.name"] = "chat"
if extra: if extra:
attrs.update(extra) attrs.update(extra)
return span("model.call", attributes=attrs) return span("model.call", attributes=attrs)
@ -239,6 +286,152 @@ def kb_persist_span(
return span("kb.persist", attributes=attrs) return span("kb.persist", attributes=attrs)
def chat_request_span(
*,
chat_id: int | None = None,
search_space_id: int | None = None,
flow: str | None = None,
request_id: str | None = None,
turn_id: str | None = None,
filesystem_mode: str | None = None,
client_platform: str | None = None,
agent_mode: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Parent span for a single streamed chat or resume turn."""
attrs: dict[str, Any] = {}
if chat_id is not None:
attrs["chat.id"] = int(chat_id)
if search_space_id is not None:
attrs["search_space.id"] = int(search_space_id)
if flow:
attrs["chat.flow"] = flow
if request_id:
attrs["request.id"] = request_id
if turn_id:
attrs["turn.id"] = turn_id
if filesystem_mode:
attrs["filesystem.mode"] = filesystem_mode
if client_platform:
attrs["client.platform"] = client_platform
if agent_mode:
attrs["agent.mode"] = agent_mode
if extra:
attrs.update(extra)
return span("chat.request", attributes=attrs)
def subagent_invoke_span(
*,
subagent_type: str,
path: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around invoking a delegated subagent from the main agent."""
attrs: dict[str, Any] = {"subagent.type": subagent_type}
if path:
attrs["subagent.path"] = path
if extra:
attrs.update(extra)
return span("subagent.invoke", attributes=attrs)
def connector_sync_span(
*,
connector_type: str | None,
extra: dict[str, Any] | None = None,
):
"""Business-level span around connector indexing task execution."""
attrs: dict[str, Any] = {"connector.type": connector_type or "unknown"}
if extra:
attrs.update(extra)
return span("connector.sync", attributes=attrs)
def etl_extract_span(
*,
content_type: str | None = None,
file_extension: str | None = None,
processing_mode: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around top-level ETL extraction for a file."""
attrs: dict[str, Any] = {}
if content_type:
attrs["content.type"] = content_type
if file_extension:
attrs["file.extension"] = file_extension
if processing_mode:
attrs["processing.mode"] = processing_mode
if extra:
attrs.update(extra)
return span("etl.extract", attributes=attrs)
def etl_parse_span(
*,
etl_service: str | None,
content_type: str | None = None,
file_extension: str | None = None,
processing_mode: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around a concrete ETL parser/backend call."""
attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"}
if content_type:
attrs["content.type"] = content_type
if file_extension:
attrs["file.extension"] = file_extension
if processing_mode:
attrs["processing.mode"] = processing_mode
if extra:
attrs.update(extra)
return span("etl.parse", attributes=attrs)
def etl_ocr_span(
*,
etl_service: str | None,
file_extension: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around OCR extraction from image content."""
attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"}
if file_extension:
attrs["file.extension"] = file_extension
if extra:
attrs.update(extra)
return span("etl.ocr", attributes=attrs)
def etl_picture_describe_span(
*,
image_count: int | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around describing embedded images in a document."""
attrs: dict[str, Any] = {}
if image_count is not None:
attrs["image.count"] = int(image_count)
if extra:
attrs.update(extra)
return span("etl.picture.describe", attributes=attrs)
def etl_picture_ocr_span(
*,
file_extension: str | None = None,
extra: dict[str, Any] | None = None,
):
"""Span around per-image OCR during picture description."""
attrs: dict[str, Any] = {}
if file_extension:
attrs["file.extension"] = file_extension
if extra:
attrs.update(extra)
return span("etl.picture.ocr", attributes=attrs)
def compaction_span( def compaction_span(
*, *,
reason: str | None = None, reason: str | None = None,
@ -301,14 +494,24 @@ def reload_for_tests() -> bool:
__all__ = [ __all__ = [
"add_event",
"chat_request_span",
"compaction_span", "compaction_span",
"connector_sync_span",
"etl_extract_span",
"etl_ocr_span",
"etl_parse_span",
"etl_picture_describe_span",
"etl_picture_ocr_span",
"interrupt_span", "interrupt_span",
"is_enabled", "is_enabled",
"kb_persist_span", "kb_persist_span",
"kb_search_span", "kb_search_span",
"model_call_span", "model_call_span",
"permission_asked_span", "permission_asked_span",
"record_error",
"reload_for_tests", "reload_for_tests",
"span", "span",
"subagent_invoke_span",
"tool_call_span", "tool_call_span",
] ]

View file

@ -1,13 +1,51 @@
import asyncio import asyncio
import contextlib import contextlib
import functools
import time import time
from datetime import datetime from datetime import datetime
from app.observability import metrics as ot_metrics, otel as ot
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
_MAX_FETCH_CHUNKS_PER_DOC = 20 _MAX_FETCH_CHUNKS_PER_DOC = 20
def _instrument_search(mode: str):
def _decorator(func):
@functools.wraps(func)
async def _wrapper(
self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
):
t0 = time.perf_counter()
with ot.kb_search_span(
search_space_id=search_space_id,
query_chars=len(query_text),
extra={"search.surface": "chunks", "search.mode": mode},
) as sp:
try:
result = await func(
self, query_text, top_k, search_space_id, *args, **kwargs
)
except Exception:
ot_metrics.record_kb_search_duration(
(time.perf_counter() - t0) * 1000,
search_space_id=search_space_id,
surface="chunks",
)
raise
sp.set_attribute("result.count", len(result))
ot_metrics.record_kb_search_duration(
(time.perf_counter() - t0) * 1000,
search_space_id=search_space_id,
surface="chunks",
)
return result
return _wrapper
return _decorator
class ChucksHybridSearchRetriever: class ChucksHybridSearchRetriever:
def __init__(self, db_session): def __init__(self, db_session):
""" """
@ -18,6 +56,7 @@ class ChucksHybridSearchRetriever:
""" """
self.db_session = db_session self.db_session = db_session
@_instrument_search("vector")
async def vector_search( async def vector_search(
self, self,
query_text: str, query_text: str,
@ -88,6 +127,7 @@ class ChucksHybridSearchRetriever:
return chunks return chunks
@_instrument_search("full_text")
async def full_text_search( async def full_text_search(
self, self,
query_text: str, query_text: str,
@ -153,6 +193,7 @@ class ChucksHybridSearchRetriever:
return chunks return chunks
@_instrument_search("hybrid")
async def hybrid_search( async def hybrid_search(
self, self,
query_text: str, query_text: str,

View file

@ -1,12 +1,50 @@
import contextlib import contextlib
import functools
import time import time
from datetime import datetime from datetime import datetime
from app.observability import metrics as ot_metrics, otel as ot
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
_MAX_FETCH_CHUNKS_PER_DOC = 20 _MAX_FETCH_CHUNKS_PER_DOC = 20
def _instrument_search(mode: str):
def _decorator(func):
@functools.wraps(func)
async def _wrapper(
self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
):
t0 = time.perf_counter()
with ot.kb_search_span(
search_space_id=search_space_id,
query_chars=len(query_text),
extra={"search.surface": "documents", "search.mode": mode},
) as sp:
try:
result = await func(
self, query_text, top_k, search_space_id, *args, **kwargs
)
except Exception:
ot_metrics.record_kb_search_duration(
(time.perf_counter() - t0) * 1000,
search_space_id=search_space_id,
surface="documents",
)
raise
sp.set_attribute("result.count", len(result))
ot_metrics.record_kb_search_duration(
(time.perf_counter() - t0) * 1000,
search_space_id=search_space_id,
surface="documents",
)
return result
return _wrapper
return _decorator
class DocumentHybridSearchRetriever: class DocumentHybridSearchRetriever:
def __init__(self, db_session): def __init__(self, db_session):
""" """
@ -17,6 +55,7 @@ class DocumentHybridSearchRetriever:
""" """
self.db_session = db_session self.db_session = db_session
@_instrument_search("vector")
async def vector_search( async def vector_search(
self, self,
query_text: str, query_text: str,
@ -81,6 +120,7 @@ class DocumentHybridSearchRetriever:
return documents return documents
@_instrument_search("full_text")
async def full_text_search( async def full_text_search(
self, self,
query_text: str, query_text: str,
@ -145,6 +185,7 @@ class DocumentHybridSearchRetriever:
return documents return documents
@_instrument_search("hybrid")
async def hybrid_search( async def hybrid_search(
self, self,
query_text: str, query_text: str,

View file

@ -43,6 +43,7 @@ from app.db import (
async_session_maker, async_session_maker,
get_async_session, get_async_session,
) )
from app.observability import metrics as ot_metrics, otel as ot
from app.schemas import ( from app.schemas import (
GoogleDriveIndexRequest, GoogleDriveIndexRequest,
MCPConnectorCreate, MCPConnectorCreate,
@ -104,7 +105,9 @@ async def _run_indexing_heartbeat_loop(notification_id: int) -> None:
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL) await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
try: try:
get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive") get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="connector")
except Exception as e: except Exception as e:
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="connector")
logger.warning( logger.warning(
f"Failed to refresh Redis heartbeat for notification " f"Failed to refresh Redis heartbeat for notification "
f"{notification_id}: {e}" f"{notification_id}: {e}"
@ -1243,6 +1246,12 @@ async def _persist_auth_expired(session: AsyncSession, connector_id: int) -> Non
"""Flag a connector as auth_expired so the frontend shows a re-auth prompt.""" """Flag a connector as auth_expired so the frontend shows a re-auth prompt."""
from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.orm.attributes import flag_modified
ot.add_event(
"connector.auth.expired",
{
"error.category": "auth_failed",
},
)
try: try:
result = await session.execute( result = await session.execute(
select(SearchSourceConnector).where( select(SearchSourceConnector).where(
@ -1302,6 +1311,13 @@ async def _run_indexing_with_notifications(
try: try:
connector_lock_acquired = acquire_connector_indexing_lock(connector_id) connector_lock_acquired = acquire_connector_indexing_lock(connector_id)
if not connector_lock_acquired: if not connector_lock_acquired:
ot.add_event(
"connector.sync.skipped",
{
"skip.reason": "lock_contention",
"error.category": "lock_contention",
},
)
logger.info( logger.info(
f"Skipping indexing for connector {connector_id} " f"Skipping indexing for connector {connector_id} "
"(another worker already holds Redis connector lock)" "(another worker already holds Redis connector lock)"
@ -1338,7 +1354,13 @@ async def _run_indexing_with_notifications(
get_heartbeat_redis_client().setex( get_heartbeat_redis_client().setex(
heartbeat_key, HEARTBEAT_TTL_SECONDS, "0" heartbeat_key, HEARTBEAT_TTL_SECONDS, "0"
) )
ot_metrics.record_celery_heartbeat_refresh(
heartbeat_type="connector"
)
except Exception as e: except Exception as e:
ot_metrics.record_celery_heartbeat_failure(
heartbeat_type="connector"
)
logger.warning(f"Failed to set initial Redis heartbeat: {e}") logger.warning(f"Failed to set initial Redis heartbeat: {e}")
# Start a background coroutine that refreshes the # Start a background coroutine that refreshes the
@ -1366,6 +1388,15 @@ async def _run_indexing_with_notifications(
) -> None: ) -> None:
"""Callback to update notification during API retries (rate limits, etc.)""" """Callback to update notification during API retries (rate limits, etc.)"""
nonlocal notification nonlocal notification
ot.add_event(
"connector.retry.scheduled",
{
"retry.reason": retry_reason,
"retry.attempt": attempt,
"retry.max": max_attempts,
"retry.delay_ms": int(wait_seconds * 1000),
},
)
if notification: if notification:
try: try:
await session.refresh(notification) await session.refresh(notification)
@ -1397,8 +1428,14 @@ async def _run_indexing_with_notifications(
get_heartbeat_redis_client().setex( get_heartbeat_redis_client().setex(
heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count) heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count)
) )
ot_metrics.record_celery_heartbeat_refresh(
heartbeat_type="connector"
)
except Exception as e: except Exception as e:
# Don't let Redis errors break the indexing # Don't let Redis errors break the indexing
ot_metrics.record_celery_heartbeat_failure(
heartbeat_type="connector"
)
logger.warning(f"Failed to set Redis heartbeat: {e}") logger.warning(f"Failed to set Redis heartbeat: {e}")
try: try:

View file

@ -37,6 +37,10 @@ def get_celery_session_maker() -> async_sessionmaker:
poolclass=NullPool, poolclass=NullPool,
echo=False, echo=False,
) )
with contextlib.suppress(Exception):
from app.observability.bootstrap import instrument_sqlalchemy_engine
instrument_sqlalchemy_engine(_celery_engine)
_celery_session_maker = async_sessionmaker( _celery_session_maker = async_sessionmaker(
_celery_engine, expire_on_commit=False _celery_engine, expire_on_commit=False
) )

View file

@ -1,14 +1,52 @@
"""Celery tasks for connector indexing.""" """Celery tasks for connector indexing."""
import logging import logging
import time
import traceback import traceback
from collections.abc import Awaitable, Callable
from celery import current_task
from app.celery_app import celery_app from app.celery_app import celery_app
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task from app.observability import metrics as ot_metrics, otel as ot
from app.tasks.celery_tasks import (
get_celery_session_maker,
run_async_celery_task as _run_async_celery_task,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
"""Run connector sync work and record aggregate connector metrics."""
task_name = getattr(current_task, "name", None) or "unknown"
t0 = time.perf_counter()
status = "failed"
error_category: str | None = None
try:
with ot.connector_sync_span(connector_type=task_name) as sp:
try:
result = _run_async_celery_task(coro_factory)
sp.set_attribute("connector.status", "success")
except Exception as exc:
error_category = ot_metrics.categorize_exception(exc)
sp.set_attribute("connector.error.category", error_category)
raise
status = "success"
return result
finally:
elapsed_s = time.perf_counter() - t0
ot_metrics.record_connector_sync_duration(
elapsed_s,
connector_type=task_name,
)
ot_metrics.record_connector_sync_outcome(
connector_type=task_name,
status=status,
error_category=error_category,
)
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None: def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
""" """
Handle greenlet_spawn errors with detailed logging for debugging. Handle greenlet_spawn errors with detailed logging for debugging.

View file

@ -9,6 +9,7 @@ from uuid import UUID
from app.celery_app import celery_app from app.celery_app import celery_app
from app.config import config from app.config import config
from app.observability import metrics as ot_metrics
from app.services.notification_service import NotificationService from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
@ -59,7 +60,9 @@ def _start_heartbeat(notification_id: int) -> None:
try: try:
key = _get_heartbeat_key(notification_id) key = _get_heartbeat_key(notification_id)
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started") _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started")
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
except Exception as e: except Exception as e:
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
logger.warning( logger.warning(
f"Failed to set initial heartbeat for notification {notification_id}: {e}" f"Failed to set initial heartbeat for notification {notification_id}: {e}"
) )
@ -87,7 +90,9 @@ async def _run_heartbeat_loop(notification_id: int):
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL) await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
try: try:
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive") _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
except Exception as e: except Exception as e:
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
logger.warning( logger.warning(
f"Failed to refresh heartbeat for notification {notification_id}: {e}" f"Failed to refresh heartbeat for notification {notification_id}: {e}"
) )

View file

@ -14,6 +14,7 @@ import contextlib
import gc import gc
import json import json
import logging import logging
import sys
import time import time
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from dataclasses import dataclass, field from dataclasses import dataclass, field
@ -58,6 +59,7 @@ from app.db import (
async_session_maker, async_session_maker,
shielded_async_session, shielded_async_session,
) )
from app.observability import metrics as ot_metrics, otel as ot
from app.prompts import TITLE_GENERATION_PROMPT from app.prompts import TITLE_GENERATION_PROMPT
from app.services.auto_model_pin_service import ( from app.services.auto_model_pin_service import (
mark_runtime_cooldown, mark_runtime_cooldown,
@ -883,6 +885,20 @@ async def stream_new_chat(
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}" stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
stream_result.filesystem_mode = fs_mode stream_result.filesystem_mode = fs_mode
stream_result.client_platform = fs_platform stream_result.client_platform = fs_platform
chat_agent_mode = "unknown"
chat_outcome = "success"
chat_error_category: str | None = None
chat_span_cm = ot.chat_request_span(
chat_id=chat_id,
search_space_id=search_space_id,
flow=flow,
request_id=request_id,
turn_id=stream_result.turn_id,
filesystem_mode=fs_mode,
client_platform=fs_platform,
agent_mode=chat_agent_mode,
)
chat_span = chat_span_cm.__enter__()
_log_file_contract("turn_start", stream_result) _log_file_contract("turn_start", stream_result)
_perf_log.info( _perf_log.info(
"[stream_new_chat] filesystem_mode=%s client_platform=%s", "[stream_new_chat] filesystem_mode=%s client_platform=%s",
@ -971,6 +987,14 @@ async def stream_new_chat(
requires_image_input=_requires_image_input, requires_image_input=_requires_image_input,
) )
).resolved_llm_config_id ).resolved_llm_config_id
ot.add_event(
"model.pin.resolved",
{
"pin.requested_id": requested_llm_config_id,
"pin.resolved_id": llm_config_id,
"pin.requires_image_input": _requires_image_input,
},
)
except ValueError as pin_error: except ValueError as pin_error:
# Auto-pin's "no vision-capable cfg" path raises a ValueError # Auto-pin's "no vision-capable cfg" path raises a ValueError
# whose message we map to the friendly image-input SSE error # whose message we map to the friendly image-input SSE error
@ -987,6 +1011,13 @@ async def stream_new_chat(
if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT" if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
else "server_error" else "server_error"
) )
if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT":
ot.add_event(
"quota.denied",
{
"quota.code": error_code,
},
)
yield _emit_stream_error( yield _emit_stream_error(
message=str(pin_error), message=str(pin_error),
error_kind=error_kind, error_kind=error_kind,
@ -1041,6 +1072,12 @@ async def stream_new_chat(
model_label = ( model_label = (
agent_config.config_name or agent_config.model_name or "model" agent_config.config_name or agent_config.model_name or "model"
) )
ot.add_event(
"quota.denied",
{
"quota.code": "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT",
},
)
yield _emit_stream_error( yield _emit_stream_error(
message=( message=(
f"The selected model ({model_label}) does not support " f"The selected model ({model_label}) does not support "
@ -1084,6 +1121,12 @@ async def stream_new_chat(
) )
_premium_reserved_micros = reserve_amount_micros _premium_reserved_micros = reserve_amount_micros
if not quota_result.allowed: if not quota_result.allowed:
ot.add_event(
"quota.denied",
{
"quota.code": "PREMIUM_QUOTA_EXHAUSTED",
},
)
if requested_llm_config_id == 0: if requested_llm_config_id == 0:
try: try:
llm_config_id = ( llm_config_id = (
@ -1097,6 +1140,13 @@ async def stream_new_chat(
requires_image_input=_requires_image_input, requires_image_input=_requires_image_input,
) )
).resolved_llm_config_id ).resolved_llm_config_id
ot.add_event(
"model.repin",
{
"repin.reason": "premium_quota_exhausted",
"repin.to_config_id": llm_config_id,
},
)
except ValueError as pin_error: except ValueError as pin_error:
yield _emit_stream_error( yield _emit_stream_error(
message=str(pin_error), message=str(pin_error),
@ -1189,6 +1239,9 @@ async def stream_new_chat(
from app.config import config as _app_config from app.config import config as _app_config
use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED) use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED)
chat_agent_mode = "multi" if use_multi_agent else "single"
with contextlib.suppress(Exception):
chat_span.set_attribute("agent.mode", chat_agent_mode)
_t0 = time.perf_counter() _t0 = time.perf_counter()
agent_factory = ( agent_factory = (
@ -1863,6 +1916,14 @@ async def stream_new_chat(
llm_config_id, llm_config_id,
time.perf_counter() - _t0, time.perf_counter() - _t0,
) )
ot.add_event(
"chat.rate_limit.recovered",
{
"recovery.reason": "provider_rate_limited",
"recovery.previous_config_id": previous_config_id,
"recovery.fallback_config_id": llm_config_id,
},
)
_log_chat_stream_error( _log_chat_stream_error(
flow=flow, flow=flow,
error_kind="rate_limited", error_kind="rate_limited",
@ -1893,6 +1954,12 @@ async def stream_new_chat(
log_system_snapshot("stream_new_chat_END") log_system_snapshot("stream_new_chat_END")
if stream_result.is_interrupted: if stream_result.is_interrupted:
ot.add_event(
"chat.interrupted",
{
"chat.flow": flow,
},
)
if title_task is not None and not title_task.done(): if title_task is not None and not title_task.done():
title_task.cancel() title_task.cancel()
@ -2011,6 +2078,12 @@ async def stream_new_chat(
user_message, user_message,
error_extra, error_extra,
) = _classify_stream_exception(e, flow_label="chat") ) = _classify_stream_exception(e, flow_label="chat")
chat_outcome = error_code or error_kind or "error"
chat_error_category = ot_metrics.categorize_exception(e)
with contextlib.suppress(Exception):
chat_span.set_attribute("chat.outcome", chat_outcome)
chat_span.set_attribute("error.category", chat_error_category)
ot.record_error(chat_span, e)
error_message = f"Error during chat: {e!s}" error_message = f"Error during chat: {e!s}"
print(f"[stream_new_chat] {error_message}") print(f"[stream_new_chat] {error_message}")
print(f"[stream_new_chat] Exception type: {type(e).__name__}") print(f"[stream_new_chat] Exception type: {type(e).__name__}")
@ -2201,6 +2274,21 @@ async def stream_new_chat(
) )
trim_native_heap() trim_native_heap()
log_system_snapshot("stream_new_chat_END") log_system_snapshot("stream_new_chat_END")
with contextlib.suppress(Exception):
chat_span.set_attribute("chat.outcome", chat_outcome)
ot_metrics.record_chat_request_duration(
(time.perf_counter() - _t_total) * 1000,
flow=flow,
outcome=chat_outcome,
agent_mode=chat_agent_mode,
)
ot_metrics.record_chat_request_outcome(
flow=flow,
outcome=chat_outcome,
agent_mode=chat_agent_mode,
error_category=chat_error_category,
)
chat_span_cm.__exit__(*sys.exc_info())
async def stream_resume_chat( async def stream_resume_chat(
@ -2225,6 +2313,20 @@ async def stream_resume_chat(
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}" stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
stream_result.filesystem_mode = fs_mode stream_result.filesystem_mode = fs_mode
stream_result.client_platform = fs_platform stream_result.client_platform = fs_platform
chat_agent_mode = "unknown"
chat_outcome = "success"
chat_error_category: str | None = None
chat_span_cm = ot.chat_request_span(
chat_id=chat_id,
search_space_id=search_space_id,
flow="resume",
request_id=request_id,
turn_id=stream_result.turn_id,
filesystem_mode=fs_mode,
client_platform=fs_platform,
agent_mode=chat_agent_mode,
)
chat_span = chat_span_cm.__enter__()
_log_file_contract("turn_start", stream_result) _log_file_contract("turn_start", stream_result)
_perf_log.info( _perf_log.info(
"[stream_resume] filesystem_mode=%s client_platform=%s", "[stream_resume] filesystem_mode=%s client_platform=%s",
@ -2297,6 +2399,14 @@ async def stream_resume_chat(
selected_llm_config_id=llm_config_id, selected_llm_config_id=llm_config_id,
) )
).resolved_llm_config_id ).resolved_llm_config_id
ot.add_event(
"model.pin.resolved",
{
"pin.requested_id": requested_llm_config_id,
"pin.resolved_id": llm_config_id,
"pin.requires_image_input": False,
},
)
except ValueError as pin_error: except ValueError as pin_error:
yield _emit_stream_error( yield _emit_stream_error(
message=str(pin_error), message=str(pin_error),
@ -2353,6 +2463,12 @@ async def stream_resume_chat(
) )
_resume_premium_reserved_micros = reserve_amount_micros _resume_premium_reserved_micros = reserve_amount_micros
if not quota_result.allowed: if not quota_result.allowed:
ot.add_event(
"quota.denied",
{
"quota.code": "PREMIUM_QUOTA_EXHAUSTED",
},
)
if requested_llm_config_id == 0: if requested_llm_config_id == 0:
try: try:
llm_config_id = ( llm_config_id = (
@ -2365,6 +2481,13 @@ async def stream_resume_chat(
force_repin_free=True, force_repin_free=True,
) )
).resolved_llm_config_id ).resolved_llm_config_id
ot.add_event(
"model.repin",
{
"repin.reason": "premium_quota_exhausted",
"repin.to_config_id": llm_config_id,
},
)
except ValueError as pin_error: except ValueError as pin_error:
yield _emit_stream_error( yield _emit_stream_error(
message=str(pin_error), message=str(pin_error),
@ -2454,6 +2577,11 @@ async def stream_resume_chat(
visibility = thread_visibility or ChatVisibility.PRIVATE visibility = thread_visibility or ChatVisibility.PRIVATE
from app.config import config as _app_config from app.config import config as _app_config
chat_agent_mode = (
"multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single"
)
with contextlib.suppress(Exception):
chat_span.set_attribute("agent.mode", chat_agent_mode)
_t0 = time.perf_counter() _t0 = time.perf_counter()
agent_factory = ( agent_factory = (
create_multi_agent_chat_deep_agent create_multi_agent_chat_deep_agent
@ -2695,6 +2823,14 @@ async def stream_resume_chat(
llm_config_id, llm_config_id,
time.perf_counter() - _t0, time.perf_counter() - _t0,
) )
ot.add_event(
"chat.rate_limit.recovered",
{
"recovery.reason": "provider_rate_limited",
"recovery.previous_config_id": previous_config_id,
"recovery.fallback_config_id": llm_config_id,
},
)
_log_chat_stream_error( _log_chat_stream_error(
flow="resume", flow="resume",
error_kind="rate_limited", error_kind="rate_limited",
@ -2722,6 +2858,12 @@ async def stream_resume_chat(
chat_id, chat_id,
) )
if stream_result.is_interrupted: if stream_result.is_interrupted:
ot.add_event(
"chat.interrupted",
{
"chat.flow": "resume",
},
)
usage_summary = accumulator.per_message_summary() usage_summary = accumulator.per_message_summary()
_perf_log.info( _perf_log.info(
"[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s", "[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
@ -2815,6 +2957,12 @@ async def stream_resume_chat(
user_message, user_message,
error_extra, error_extra,
) = _classify_stream_exception(e, flow_label="resume") ) = _classify_stream_exception(e, flow_label="resume")
chat_outcome = error_code or error_kind or "error"
chat_error_category = ot_metrics.categorize_exception(e)
with contextlib.suppress(Exception):
chat_span.set_attribute("chat.outcome", chat_outcome)
chat_span.set_attribute("error.category", chat_error_category)
ot.record_error(chat_span, e)
error_message = f"Error during resume: {e!s}" error_message = f"Error during resume: {e!s}"
print(f"[stream_resume_chat] {error_message}") print(f"[stream_resume_chat] {error_message}")
print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}") print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}")
@ -2964,3 +3112,18 @@ async def stream_resume_chat(
) )
trim_native_heap() trim_native_heap()
log_system_snapshot("stream_resume_chat_END") log_system_snapshot("stream_resume_chat_END")
with contextlib.suppress(Exception):
chat_span.set_attribute("chat.outcome", chat_outcome)
ot_metrics.record_chat_request_duration(
(time.perf_counter() - _t_total) * 1000,
flow="resume",
outcome=chat_outcome,
agent_mode=chat_agent_mode,
)
ot_metrics.record_chat_request_outcome(
flow="resume",
outcome=chat_outcome,
agent_mode=chat_agent_mode,
error_category=chat_error_category,
)
chat_span_cm.__exit__(*sys.exc_info())

View file

@ -16,6 +16,8 @@ import time
from contextlib import asynccontextmanager, contextmanager from contextlib import asynccontextmanager, contextmanager
from typing import Any from typing import Any
from app.observability import metrics as ot_metrics
_perf_log: logging.Logger | None = None _perf_log: logging.Logger | None = None
_last_rss_mb: float = 0.0 _last_rss_mb: float = 0.0
@ -50,6 +52,7 @@ def perf_timer(label: str, *, extra: dict[str, Any] | None = None):
if extra: if extra:
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items()) suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
log.info("%s in %.3fs%s", label, elapsed, suffix) log.info("%s in %.3fs%s", label, elapsed, suffix)
ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
@asynccontextmanager @asynccontextmanager
@ -68,6 +71,7 @@ async def perf_async_timer(label: str, *, extra: dict[str, Any] | None = None):
if extra: if extra:
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items()) suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
log.info("%s in %.3fs%s", label, elapsed, suffix) log.info("%s in %.3fs%s", label, elapsed, suffix)
ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
def system_snapshot() -> dict[str, Any]: def system_snapshot() -> dict[str, Any]:

View file

@ -12,9 +12,26 @@ if sys.platform == "win32":
from app.config.uvicorn import load_uvicorn_config from app.config.uvicorn import load_uvicorn_config
_old_log_record_factory = logging.getLogRecordFactory()
def _otel_safe_log_record_factory(*args, **kwargs):
record = _old_log_record_factory(*args, **kwargs)
if not hasattr(record, "otelTraceID"):
record.otelTraceID = "0"
if not hasattr(record, "otelSpanID"):
record.otelSpanID = "0"
return record
logging.setLogRecordFactory(_otel_safe_log_record_factory)
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", format=(
"%(asctime)s - %(name)s - %(levelname)s - "
"[trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] %(message)s"
),
datefmt="%Y-%m-%d %H:%M:%S", datefmt="%Y-%m-%d %H:%M:%S",
) )

View file

@ -76,6 +76,17 @@ dependencies = [
"litellm>=1.83.7", "litellm>=1.83.7",
"langchain-litellm>=0.6.4", "langchain-litellm>=0.6.4",
"deepagents>=0.4.12,<0.5", "deepagents>=0.4.12,<0.5",
"opentelemetry-api>=1.40.0",
"opentelemetry-sdk>=1.40.0",
"opentelemetry-exporter-otlp>=1.40.0",
"opentelemetry-semantic-conventions>=0.61b0",
"opentelemetry-instrumentation-fastapi>=0.61b0",
"opentelemetry-instrumentation-sqlalchemy>=0.61b0",
"opentelemetry-instrumentation-psycopg>=0.61b0",
"opentelemetry-instrumentation-redis>=0.61b0",
"opentelemetry-instrumentation-httpx>=0.61b0",
"opentelemetry-instrumentation-celery>=0.61b0",
"opentelemetry-instrumentation-logging>=0.61b0",
] ]
[dependency-groups] [dependency-groups]

View file

@ -23,6 +23,7 @@ pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _disable_otel(monkeypatch: pytest.MonkeyPatch): def _disable_otel(monkeypatch: pytest.MonkeyPatch):
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False) monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
from app.observability import otel as ot from app.observability import otel as ot
@ -99,16 +100,17 @@ class TestAnnotateModelResponse:
"total_tokens": 150, "total_tokens": 150,
}, },
) )
_annotate_model_response(sp, msg) assert _annotate_model_response(sp, msg) == (100, 50)
sp.set_attribute.assert_any_call("tokens.prompt", 100) sp.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 100)
sp.set_attribute.assert_any_call("tokens.completion", 50) sp.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 50)
sp.set_attribute.assert_any_call("tokens.total", 150) sp.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 150)
sp.set_attribute.assert_any_call("gen_ai.operation.name", "chat")
def test_handles_response_with_no_metadata(self) -> None: def test_handles_response_with_no_metadata(self) -> None:
sp = MagicMock() sp = MagicMock()
msg = AIMessage(content="hello") msg = AIMessage(content="hello")
# Should not raise even when usage_metadata is missing # Should not raise even when usage_metadata is missing
_annotate_model_response(sp, msg) assert _annotate_model_response(sp, msg) == (None, None)
class TestAnnotateToolResult: class TestAnnotateToolResult:
@ -119,7 +121,7 @@ class TestAnnotateToolResult:
tool_call_id="abc", tool_call_id="abc",
status="success", status="success",
) )
_annotate_tool_result(sp, result) assert _annotate_tool_result(sp, result) is False
sp.set_attribute.assert_any_call("tool.output.size", len("result text")) sp.set_attribute.assert_any_call("tool.output.size", len("result text"))
sp.set_attribute.assert_any_call("tool.status", "success") sp.set_attribute.assert_any_call("tool.status", "success")
@ -130,7 +132,7 @@ class TestAnnotateToolResult:
tool_call_id="abc", tool_call_id="abc",
additional_kwargs={"error": {"code": "x"}}, additional_kwargs={"error": {"code": "x"}},
) )
_annotate_tool_result(sp, result) assert _annotate_tool_result(sp, result) is True
sp.set_attribute.assert_any_call("tool.error", True) sp.set_attribute.assert_any_call("tool.error", True)
@ -193,3 +195,91 @@ class TestMiddlewareIntegration:
assert result.content == "enabled" assert result.content == "enabled"
finally: finally:
ot.reload_for_tests() ot.reload_for_tests()
async def test_enabled_model_call_records_metrics(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
from app.observability import otel as ot
duration_calls: list[dict[str, Any]] = []
token_calls: list[dict[str, Any]] = []
monkeypatch.setattr(
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_call_duration",
lambda duration_ms, **attrs: duration_calls.append(
{"duration_ms": duration_ms, **attrs}
),
)
monkeypatch.setattr(
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_token_usage",
lambda **attrs: token_calls.append(attrs),
)
ot.reload_for_tests()
try:
mw = OtelSpanMiddleware()
async def handler(req):
return AIMessage(
content="enabled",
usage_metadata={
"input_tokens": 3,
"output_tokens": 5,
"total_tokens": 8,
},
)
request = MagicMock()
request.model = MagicMock()
request.model.model_name = "gpt-4o"
request.model.provider = "openai"
await mw.awrap_model_call(request, handler)
assert duration_calls
assert token_calls == [
{
"input_tokens": 3,
"output_tokens": 5,
"model": "gpt-4o",
"provider": "openai",
}
]
finally:
ot.reload_for_tests()
async def test_enabled_tool_call_records_error_metric(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
from app.observability import otel as ot
errors: list[str] = []
monkeypatch.setattr(
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_error",
lambda *, tool_name: errors.append(tool_name),
)
monkeypatch.setattr(
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_duration",
lambda *args, **kwargs: None,
)
ot.reload_for_tests()
try:
mw = OtelSpanMiddleware()
async def handler(req):
return ToolMessage(
content="failed",
tool_call_id="abc",
status="error",
)
request = MagicMock()
request.tool = MagicMock()
request.tool.name = "web_search"
await mw.awrap_tool_call(request, handler)
assert errors == ["web_search"]
finally:
ot.reload_for_tests()

View file

@ -0,0 +1,101 @@
"""Tests for pure observability helper functions."""
from __future__ import annotations
import pytest
from app.observability import metrics as ot_metrics, otel as ot
pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True)
def _disable_otel(monkeypatch: pytest.MonkeyPatch):
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
ot.reload_for_tests()
yield
ot.reload_for_tests()
@pytest.mark.parametrize(
("task_name", "expected"),
[
("reindex_document", "reindex"),
("delete_document_background", "delete"),
("delete_folder_documents_background", "delete"),
("delete_search_space_background", "delete"),
("process_extension_document", "process"),
("process_youtube_video", "process"),
("process_file_upload", "process"),
("process_file_upload_with_document", "process"),
("process_circleback_meeting", "process"),
("generate_video_presentation", "generate"),
("generate_content_podcast", "generate"),
("cleanup_stale_indexing_notifications", "cleanup"),
("reconcile_pending_stripe_page_purchases", "reconcile"),
("reconcile_pending_stripe_token_purchases", "reconcile"),
("check_periodic_schedules", "check"),
("ai_sort_search_space", "ai"),
("index_notion_pages", "index"),
("index_github_repos", "index"),
("index_google_drive_files", "index"),
("index_composio_connector", "index"),
("index_obsidian_attachment", "index"),
("index_local_folder", "index"),
("index_uploaded_folder_files", "index"),
("noseparator", "noseparator"),
("", "unknown"),
],
)
def test_parse_celery_task_label(task_name: str, expected: str) -> None:
assert ot_metrics.parse_celery_task_label(task_name) == expected
def test_parse_celery_task_label_handles_none() -> None:
assert ot_metrics.parse_celery_task_label(None) == "unknown"
@pytest.mark.parametrize(
("exc", "expected"),
[
(type("RateLimitError", (Exception,), {})(), "rate_limited"),
(type("AuthenticationError", (Exception,), {})(), "auth_failed"),
(type("QuotaInsufficientError", (Exception,), {})(), "quota_exhausted"),
(TimeoutError(), "timeout"),
(type("APIConnectionError", (Exception,), {})(), "network_failed"),
(type("ServiceUnavailableError", (Exception,), {})(), "server_error"),
(type("LockContentionError", (Exception,), {})(), "lock_contention"),
(type("UnsupportedFormatError", (Exception,), {})(), "unsupported_format"),
(type("ProviderError", (Exception,), {})(), "provider_error"),
(RuntimeError("plain"), "unknown"),
],
)
def test_categorize_exception(exc: BaseException, expected: str) -> None:
assert ot_metrics.categorize_exception(exc) == expected
def test_record_celery_queue_latency_noops_when_disabled() -> None:
ot_metrics.record_celery_queue_latency(
0.5,
task_name="index_notion_pages",
queue="surfsense.connectors",
scheduled=False,
operation="index",
)
def test_add_event_noops_when_disabled() -> None:
ot.add_event("test.event", {"value": 1})
def test_add_event_noops_without_current_span(monkeypatch: pytest.MonkeyPatch) -> None:
class FakeTrace:
@staticmethod
def get_current_span():
return None
monkeypatch.setattr(ot, "_ENABLED", True)
monkeypatch.setattr(ot, "_ot_trace", FakeTrace())
ot.add_event("test.event", {"value": 1})

View file

@ -4,7 +4,7 @@ from __future__ import annotations
import pytest import pytest
from app.observability import otel from app.observability import bootstrap, metrics, otel
pytestmark = pytest.mark.unit pytestmark = pytest.mark.unit
@ -12,7 +12,14 @@ pytestmark = pytest.mark.unit
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _reset_otel_state(monkeypatch: pytest.MonkeyPatch): def _reset_otel_state(monkeypatch: pytest.MonkeyPatch):
"""Force a clean OTel disabled state per test, then restore after.""" """Force a clean OTel disabled state per test, then restore after."""
for env in ("OTEL_EXPORTER_OTLP_ENDPOINT", "SURFSENSE_DISABLE_OTEL"): for env in (
"OTEL_EXPORTER_OTLP_ENDPOINT",
"OTEL_EXPORTER_OTLP_PROTOCOL",
"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
"OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
"SURFSENSE_DISABLE_OTEL",
"OTEL_SDK_DISABLED",
):
monkeypatch.delenv(env, raising=False) monkeypatch.delenv(env, raising=False)
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
otel.reload_for_tests() otel.reload_for_tests()
@ -36,6 +43,195 @@ def test_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None
assert otel.reload_for_tests() is False assert otel.reload_for_tests() is False
def test_spec_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
monkeypatch.setenv("OTEL_SDK_DISABLED", "true")
assert otel.reload_for_tests() is False
class TestBootstrapConfig:
def test_disabled_checks_both_kill_switches(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
assert bootstrap.is_otel_disabled() is False
monkeypatch.setenv("OTEL_SDK_DISABLED", "on")
assert bootstrap.is_otel_disabled() is True
def test_configured_by_shared_or_signal_endpoint(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
assert bootstrap.is_otel_configured() is False
monkeypatch.setenv(
"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "http://localhost:4317"
)
assert bootstrap.is_otel_configured() is True
def test_init_otel_noops_when_disabled(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
called = {"traces": False}
def fake_init_traces(app=None):
del app
called["traces"] = True
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
monkeypatch.setattr(bootstrap, "init_traces", fake_init_traces)
bootstrap.init_otel()
assert called["traces"] is False
def test_init_otel_dispatches_enabled_signals(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
called: list[str] = []
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
monkeypatch.setattr(
bootstrap, "init_traces", lambda app=None: called.append("traces")
)
monkeypatch.setattr(bootstrap, "init_metrics", lambda: called.append("metrics"))
monkeypatch.setattr(bootstrap, "init_logs", lambda: called.append("logs"))
bootstrap.init_otel()
assert called == ["traces", "metrics", "logs"]
def test_resource_defaults_include_service_metadata(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.setenv("OTEL_SERVICE_NAME", "custom-backend")
monkeypatch.setenv("SURFSENSE_ENV", "test")
resource = bootstrap._build_resource()
attrs = dict(resource.attributes)
assert attrs["service.name"] == "custom-backend"
assert attrs["deployment.environment.name"] == "test"
assert attrs["deployment.environment"] == "test"
assert attrs["service.instance.id"]
def test_deployment_environment_uses_surfsense_env_only(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
monkeypatch.delenv("SURFSENSE_ENV", raising=False)
assert bootstrap._deployment_environment() == "dev"
monkeypatch.setenv("SURFSENSE_ENV", "production")
assert bootstrap._deployment_environment() == "production"
def test_shutdown_is_safe_without_providers(self) -> None:
bootstrap.shutdown_otel()
def test_init_logs_enables_log_correlation(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
calls: list[dict[str, object]] = []
class FakeLoggingInstrumentor:
def instrument(self, **kwargs: object) -> None:
calls.append(kwargs)
def fake_safe_instrument(name: str, callback):
assert name == "logging"
monkeypatch.setattr(
"opentelemetry.instrumentation.logging.LoggingInstrumentor",
FakeLoggingInstrumentor,
)
callback()
return True
monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False)
monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument)
bootstrap.init_logs()
assert calls == [{"set_logging_format": True}]
class TestMetricHelpers:
def test_all_metric_helpers_noop_safely_when_disabled(self) -> None:
metrics.record_model_call_duration(12.5, model="gpt-4o", provider="openai")
metrics.record_model_token_usage(
input_tokens=10,
output_tokens=5,
model="gpt-4o",
provider="openai",
)
metrics.record_tool_call_duration(3.0, tool_name="web_search")
metrics.record_tool_call_error(tool_name="web_search")
metrics.record_kb_search_duration(
4.0,
search_space_id=1,
surface="documents",
)
metrics.record_compaction_run(reason="auto")
metrics.record_permission_ask(permission="write_file")
metrics.record_interrupt(interrupt_type="permission_ask")
metrics.record_indexing_document_duration(1.2, document_type="FILE")
metrics.record_indexing_document_outcome(document_type="FILE", status="success")
metrics.record_connector_sync_duration(
2.3,
connector_type="index_notion_pages",
)
metrics.record_connector_sync_outcome(
connector_type="index_notion_pages",
status="success",
)
metrics.record_auth_failure(reason="UNAUTHORIZED")
metrics.record_rate_limit_rejection(scope="login")
metrics.record_perf_elapsed(7.0, label="[test]")
def test_runtime_observables_register_once(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
class FakeMeter:
def __init__(self) -> None:
self.names: list[str] = []
def create_observable_gauge(self, name: str, **kwargs) -> None:
del kwargs
self.names.append(name)
fake_meter = FakeMeter()
monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
monkeypatch.setattr(metrics, "_is_enabled", lambda: True)
monkeypatch.setattr(metrics, "_get_meter", lambda: fake_meter)
metrics.register_runtime_observables()
metrics.register_runtime_observables()
assert len(fake_meter.names) == 6
assert fake_meter.names.count("python.asyncio.tasks") == 1
monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
def test_log_record_factory_provides_zero_otel_fields() -> None:
import logging
import main # noqa: F401
record = logging.getLogRecordFactory()(
"test",
logging.INFO,
__file__,
1,
"hello",
(),
None,
)
assert record.otelTraceID == "0"
assert record.otelSpanID == "0"
class TestNoopSpansWhenDisabled: class TestNoopSpansWhenDisabled:
def test_generic_span_yields_noop(self) -> None: def test_generic_span_yields_noop(self) -> None:
with otel.span("any.thing", attributes={"x": 1}) as sp: with otel.span("any.thing", attributes={"x": 1}) as sp:

View file

@ -0,0 +1,61 @@
"""Tests for retriever OTel wrappers."""
from __future__ import annotations
from contextlib import contextmanager
from typing import Any
import pytest
from app.retriever.documents_hybrid_search import _instrument_search
pytestmark = pytest.mark.unit
class _Span:
def __init__(self) -> None:
self.attrs: dict[str, Any] = {}
def set_attribute(self, key: str, value: Any) -> None:
self.attrs[key] = value
@contextmanager
def _fake_span(**kwargs):
span = _Span()
span.attrs.update(kwargs)
yield span
@pytest.mark.asyncio
async def test_retriever_wrapper_records_one_span_and_metric(monkeypatch) -> None:
calls: list[dict[str, Any]] = []
monkeypatch.setattr(
"app.retriever.documents_hybrid_search.ot.kb_search_span",
lambda **kwargs: _fake_span(**kwargs),
)
monkeypatch.setattr(
"app.retriever.documents_hybrid_search.ot_metrics.record_kb_search_duration",
lambda duration_ms, **attrs: calls.append(
{"duration_ms": duration_ms, **attrs}
),
)
class Retriever:
@_instrument_search("hybrid")
async def search(
self,
query_text: str,
top_k: int,
search_space_id: int,
) -> list[str]:
del query_text, top_k, search_space_id
return ["doc-1", "doc-2"]
result = await Retriever().search("hello", 3, 42)
assert result == ["doc-1", "doc-2"]
assert len(calls) == 1
assert calls[0]["search_space_id"] == 42
assert calls[0]["surface"] == "documents"

8632
surfsense_backend/uv.lock generated

File diff suppressed because it is too large Load diff

View file

@ -3,9 +3,9 @@ import { useTranslations } from "next-intl";
import { useState } from "react"; import { useState } from "react";
import { Logo } from "@/components/Logo"; import { Logo } from "@/components/Logo";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { BACKEND_URL } from "@/lib/env-config";
import { trackLoginAttempt } from "@/lib/posthog/events"; import { trackLoginAttempt } from "@/lib/posthog/events";
import { AmbientBackground } from "./AmbientBackground"; import { AmbientBackground } from "./AmbientBackground";
import { BACKEND_URL } from "@/lib/env-config";
function GoogleGLogo({ className }: { className?: string }) { function GoogleGLogo({ className }: { className?: string }) {
return ( return (

View file

@ -3,16 +3,19 @@
import { ThreadPrimitive } from "@assistant-ui/react"; import { ThreadPrimitive } from "@assistant-ui/react";
import { ArrowDownIcon } from "lucide-react"; import { ArrowDownIcon } from "lucide-react";
import type { FC, ReactNode } from "react"; import type { FC, ReactNode } from "react";
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; import { Button } from "@/components/ui/button";
const ChatScrollToBottom: FC = () => ( const ChatScrollToBottom: FC = () => (
<ThreadPrimitive.ScrollToBottom asChild> <ThreadPrimitive.ScrollToBottom asChild>
<TooltipIconButton <Button
tooltip="Scroll to bottom" type="button"
className="aui-thread-scroll-to-bottom -top-12 absolute z-10 self-center rounded-full border-0 bg-muted p-4 text-foreground hover:bg-accent hover:text-accent-foreground disabled:invisible" variant="ghost"
size="icon"
aria-label="Scroll to bottom"
className="aui-thread-scroll-to-bottom -top-12 absolute z-10 size-10 self-center rounded-full border border-input bg-muted p-0 text-foreground shadow-sm shadow-black/5 hover:bg-accent hover:text-accent-foreground disabled:invisible dark:shadow-black/10"
> >
<ArrowDownIcon /> <ArrowDownIcon />
</TooltipIconButton> </Button>
</ThreadPrimitive.ScrollToBottom> </ThreadPrimitive.ScrollToBottom>
); );

View file

@ -1,7 +1,7 @@
"use client"; "use client";
import { useAtomValue } from "jotai"; import { useAtomValue } from "jotai";
import { AlertTriangle, Settings } from "lucide-react"; import { AlertTriangle } from "lucide-react";
import { useRouter } from "next/navigation"; import { useRouter } from "next/navigation";
import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react"; import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react";
import { createPortal } from "react-dom"; import { createPortal } from "react-dom";
@ -381,7 +381,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
{/* LLM Configuration Warning */} {/* LLM Configuration Warning */}
{!llmConfigLoading && !hasDocumentSummaryLLM && ( {!llmConfigLoading && !hasDocumentSummaryLLM && (
<div className="mb-6"> <div className="mb-6">
<Alert variant="destructive"> <Alert variant="warning">
<AlertTriangle /> <AlertTriangle />
<AlertTitle>LLM Configuration Required</AlertTitle> <AlertTitle>LLM Configuration Required</AlertTitle>
<AlertDescription> <AlertDescription>
@ -392,7 +392,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
</p> </p>
<Button <Button
size="sm" size="sm"
variant="outline" variant="secondary"
onClick={() => { onClick={() => {
handleOpenChange(false); handleOpenChange(false);
router.push( router.push(
@ -400,7 +400,6 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
); );
}} }}
> >
<Settings className="mr-2 h-4 w-4" />
Go to Settings Go to Settings
</Button> </Button>
</AlertDescription> </AlertDescription>

View file

@ -1,7 +1,7 @@
"use client"; "use client";
import { useAtomValue } from "jotai"; import { useAtomValue } from "jotai";
import { AlertTriangle, Settings } from "lucide-react"; import { AlertTriangle } from "lucide-react";
import { useRouter } from "next/navigation"; import { useRouter } from "next/navigation";
import { import {
createContext, createContext,
@ -148,7 +148,7 @@ const DocumentUploadPopupContent: FC<{
<div className="px-4 sm:px-6 pb-4 sm:pb-6"> <div className="px-4 sm:px-6 pb-4 sm:pb-6">
{!isLoading && !hasDocumentSummaryLLM ? ( {!isLoading && !hasDocumentSummaryLLM ? (
<div className="mb-4"> <div className="mb-4">
<Alert variant="destructive"> <Alert variant="warning">
<AlertTriangle /> <AlertTriangle />
<AlertTitle>LLM Configuration Required</AlertTitle> <AlertTitle>LLM Configuration Required</AlertTitle>
<AlertDescription> <AlertDescription>
@ -159,13 +159,12 @@ const DocumentUploadPopupContent: FC<{
</p> </p>
<Button <Button
size="sm" size="sm"
variant="outline" variant="secondary"
onClick={() => { onClick={() => {
onOpenChange(false); onOpenChange(false);
router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`); router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`);
}} }}
> >
<Settings className="mr-2 h-4 w-4" />
Go to Settings Go to Settings
</Button> </Button>
</AlertDescription> </AlertDescription>

View file

@ -2,14 +2,9 @@
import { Settings, Trash2, Users } from "lucide-react"; import { Settings, Trash2, Users } from "lucide-react";
import { useTranslations } from "next-intl"; import { useTranslations } from "next-intl";
import type { MouseEvent } from "react";
import { useCallback, useRef, useState } from "react"; import { useCallback, useRef, useState } from "react";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import {
ContextMenu,
ContextMenuContent,
ContextMenuItem,
ContextMenuTrigger,
} from "@/components/ui/context-menu";
import { import {
DropdownMenu, DropdownMenu,
DropdownMenuContent, DropdownMenuContent,
@ -80,19 +75,28 @@ export function SearchSpaceAvatar({
const initials = getInitials(name); const initials = getInitials(name);
const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm"; const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm";
// Long-press state for mobile const [menuOpen, setMenuOpen] = useState(false);
const [longPressMenuOpen, setLongPressMenuOpen] = useState(false);
const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null); const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
const touchMoved = useRef(false); const touchMoved = useRef(false);
const openMenu = useCallback(() => {
setMenuOpen(true);
}, []);
const handleContextMenu = useCallback((event: MouseEvent<HTMLButtonElement>) => {
event.preventDefault();
event.stopPropagation();
setMenuOpen(true);
}, []);
const handleTouchStart = useCallback(() => { const handleTouchStart = useCallback(() => {
touchMoved.current = false; touchMoved.current = false;
longPressTimer.current = setTimeout(() => { longPressTimer.current = setTimeout(() => {
if (!touchMoved.current) { if (!touchMoved.current) {
setLongPressMenuOpen(true); openMenu();
} }
}, 500); }, 500);
}, []); }, [openMenu]);
const handleTouchMove = useCallback(() => { const handleTouchMove = useCallback(() => {
touchMoved.current = true; touchMoved.current = true;
@ -120,12 +124,26 @@ export function SearchSpaceAvatar({
</div> </div>
); );
const avatarButton = ( const avatarButton = (withMenuHandlers = false) => (
<Button <Button
type="button" type="button"
variant="ghost" variant="ghost"
size="icon" size="icon"
onClick={onClick} onClick={onClick}
onPointerDown={
withMenuHandlers
? (event) => {
if (event.button === 0) {
event.preventDefault();
}
}
: undefined
}
onContextMenu={withMenuHandlers ? handleContextMenu : undefined}
onTouchStart={withMenuHandlers ? handleTouchStart : undefined}
onTouchMove={withMenuHandlers ? handleTouchMove : undefined}
onTouchEnd={withMenuHandlers ? handleTouchEnd : undefined}
onTouchCancel={withMenuHandlers ? handleTouchEnd : undefined}
className={cn( className={cn(
"relative rounded-lg font-semibold text-white transition-all select-none", "relative rounded-lg font-semibold text-white transition-all select-none",
"hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring", "hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
@ -173,73 +191,41 @@ export function SearchSpaceAvatar({
</> </>
); );
// If delete or settings handlers are provided, wrap with context menu // If delete or settings handlers are provided, expose them through a dropdown menu.
if (onDelete || onSettings) { if (onDelete || onSettings) {
// Mobile: use long-press triggered DropdownMenu const trigger = (
if (disableTooltip) {
return (
<DropdownMenu open={longPressMenuOpen} onOpenChange={setLongPressMenuOpen}>
<DropdownMenuTrigger asChild> <DropdownMenuTrigger asChild>
<div {avatarButton(true)}
className="inline-block"
onTouchStart={handleTouchStart}
onTouchMove={handleTouchMove}
onTouchEnd={handleTouchEnd}
onTouchCancel={handleTouchEnd}
>
{avatarButton}
</div>
</DropdownMenuTrigger> </DropdownMenuTrigger>
<DropdownMenuContent>{menuItems}</DropdownMenuContent>
</DropdownMenu>
); );
}
// Desktop: use right-click ContextMenu + Tooltip
return ( return (
<ContextMenu> <DropdownMenu open={menuOpen} onOpenChange={setMenuOpen}>
{disableTooltip ? (
trigger
) : (
<Tooltip> <Tooltip>
<TooltipTrigger asChild> <TooltipTrigger asChild>{trigger}</TooltipTrigger>
<ContextMenuTrigger asChild>
<div className="inline-block">{avatarButton}</div>
</ContextMenuTrigger>
</TooltipTrigger>
<TooltipContent side="right" sideOffset={8}> <TooltipContent side="right" sideOffset={8}>
{tooltipContent} {tooltipContent}
</TooltipContent> </TooltipContent>
</Tooltip> </Tooltip>
<ContextMenuContent>
{onSettings && (
<ContextMenuItem onClick={onSettings}>
<Settings className="mr-2 h-4 w-4" />
{tCommon("settings")}
</ContextMenuItem>
)} )}
{onDelete && isOwner && ( <DropdownMenuContent side="right" align="start">
<ContextMenuItem onClick={onDelete}> {menuItems}
<Trash2 className="mr-2 h-4 w-4" /> </DropdownMenuContent>
{tCommon("delete")} </DropdownMenu>
</ContextMenuItem>
)}
{onDelete && !isOwner && (
<ContextMenuItem onClick={onDelete}>
<Trash2 className="mr-2 h-4 w-4" />
{t("leave")}
</ContextMenuItem>
)}
</ContextMenuContent>
</ContextMenu>
); );
} }
// No context menu needed // No context menu needed
if (disableTooltip) { if (disableTooltip) {
return avatarButton; return avatarButton();
} }
return ( return (
<Tooltip> <Tooltip>
<TooltipTrigger asChild>{avatarButton}</TooltipTrigger> <TooltipTrigger asChild>{avatarButton()}</TooltipTrigger>
<TooltipContent side="right" sideOffset={8}> <TooltipContent side="right" sideOffset={8}>
{tooltipContent} {tooltipContent}
</TooltipContent> </TooltipContent>

View file

@ -14,7 +14,7 @@ const buttonVariants = cva(
"bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40", "bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40",
outline: outline:
"border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground", "border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground",
secondary: "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80", secondary: "bg-accent text-accent-foreground shadow-xs hover:bg-accent/80",
ghost: ghost:
"hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0", "hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0",
link: "text-primary underline-offset-4 hover:underline", link: "text-primary underline-offset-4 hover:underline",

View file

@ -13,6 +13,7 @@
"how-to", "how-to",
"---Developers---", "---Developers---",
"testing", "testing",
"observability",
"code-of-conduct" "code-of-conduct"
] ]
} }

View file

@ -0,0 +1,167 @@
---
title: Observability
description: Configure backend traces and metrics for SurfSense
icon: Radar
---
SurfSense uses OpenTelemetry for backend traces and metrics. Application logs
include trace and span IDs so you can correlate logs with traces, but logs stay
on the normal container stderr path.
## Enable Locally
The development compose file reads backend settings from
`surfsense_backend/.env`. Add these values there:
```dotenv
SURFSENSE_ENABLE_OTEL=true
SURFSENSE_ENV=dev
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-lgtm:4317
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
OTEL_METRIC_EXPORT_INTERVAL=300000
```
Then start the development stack with the local LGTM backend:
```bash
docker compose -f docker/docker-compose.dev.yml up --build
```
Grafana is exposed on `http://localhost:3001` by default.
## Enable in Production Docker Compose
Production Docker Compose reads backend and collector settings from
`docker/.env`. The API and Celery worker export telemetry to the bundled
collector at `otel-collector:4317`; the collector is the only service that uses
the Grafana Cloud credentials.
Add these values to `docker/.env`:
```dotenv
SURFSENSE_ENV=production
SURFSENSE_ENABLE_OTEL=true
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
OTEL_METRIC_EXPORT_INTERVAL=300000
GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
GRAFANA_CLOUD_INSTANCE_ID=<stack instance id>
GRAFANA_CLOUD_API_KEY=<cloud access policy token>
```
Then start the stack:
```bash
docker compose -f docker/docker-compose.yml --profile observability up -d
```
The collector receives OTLP on `otel-collector:4317`, scrubs sensitive span
attributes, applies the configured tail-sampling policy, batches exports,
retries failures, and forwards traces and metrics to Grafana Cloud over OTLP
HTTP.
When deploying `surfsense_backend/Dockerfile` directly instead of production
compose, use the same split: SurfSense containers export to a collector, and
the collector owns the Grafana Cloud credentials.
## Automatic Traces
When OpenTelemetry is enabled, the backend instruments:
- FastAPI inbound requests.
- SQLAlchemy queries from the main async engine and Celery task engine.
- Raw psycopg calls used by the LangGraph checkpointer.
- Redis commands.
- HTTPX outbound requests.
- Celery producer and worker execution.
## Manual Spans
SurfSense keeps project-specific spans behind `app.observability.otel`:
- `model.call`
- `tool.call`
- `chat.request`
- `kb.search`
- `kb.persist`
- `connector.sync`
- `subagent.invoke`
- `etl.extract`
- `etl.parse`
- `etl.ocr`
- `etl.picture.describe`
- `etl.picture.ocr`
- `compaction.run`
- `permission.asked`
- `interrupt.raised`
Keep span names and attributes low-cardinality. Do not attach user content,
prompts, document titles, file paths, user-specific URLs, secrets, or raw
queries as span attributes.
## Metrics
The OpenTelemetry instrumentors provide HTTP, HTTPX, and Celery runtime
metrics. SurfSense adds these project metrics from `app.observability.metrics`:
- `surfsense.model.call.duration`
- `gen_ai.client.token.usage`
- `surfsense.tool.call.duration`
- `surfsense.tool.call.errors`
- `surfsense.chat.request.duration`
- `surfsense.chat.request.outcome`
- `surfsense.kb.search.duration`
- `surfsense.compaction.runs`
- `surfsense.permission.asks`
- `surfsense.interrupt.raised`
- `surfsense.indexing.document.duration`
- `surfsense.indexing.document.outcome`
- `surfsense.connector.sync.duration`
- `surfsense.connector.sync.outcome`
- `surfsense.subagent.invoke.duration`
- `surfsense.subagent.invoke.outcome`
- `surfsense.etl.extract.duration`
- `surfsense.etl.extract.outcome`
- `surfsense.celery.heartbeat.refreshes`
- `surfsense.celery.heartbeat.failures`
- `surfsense.celery.queue.latency`
- `surfsense.auth.failures`
- `surfsense.rate_limit.rejections`
- `surfsense.perf.elapsed_ms`
Runtime gauges include process RSS, CPU utilization, threads, open file
descriptors, asyncio tasks, and CPython GC counters.
## Logs
`LoggingInstrumentor().instrument()` injects `otelTraceID` and `otelSpanID` into
standard Python `LogRecord`s. The root log format writes them as
`trace_id=... span_id=...`.
SurfSense intentionally does not create an OpenTelemetry `LoggerProvider`,
`LoggingHandler`, or `OTLPLogExporter`. Container stderr remains the log
transport.
## Verification
1. Hit a FastAPI endpoint and confirm an inbound server span appears in Grafana.
2. Run a chat request and confirm `model.call` and `tool.call` child spans.
3. Run a knowledge-base search and confirm `kb.search` spans and SQL child spans.
4. Run connector indexing and confirm Celery producer/worker spans share a trace
ID and connector sync metrics increment.
5. Confirm `gen_ai.client.token.usage`, model/tool durations, request duration,
Celery runtime, and runtime gauges appear within one export interval.
6. Confirm logs emitted inside a traced request show non-zero trace and span IDs.
## Out Of Scope
- Frontend/browser OpenTelemetry.
- OpenTelemetry log export.
- Profiling.
- Production backend selection.
- Tail-sampling collector configuration.
- Replacing LangSmith.
- Vendor SDKs.

View file

@ -7,6 +7,7 @@ import {
Download, Download,
FlaskConical, FlaskConical,
Heart, Heart,
Radar,
Unplug, Unplug,
Wrench, Wrench,
} from "lucide-react"; } from "lucide-react";
@ -26,6 +27,7 @@ const DOCS_ICONS: Record<string, React.ComponentType> = {
Download, Download,
FlaskConical, FlaskConical,
Heart, Heart,
Radar,
Unplug, Unplug,
Wrench, Wrench,
}; };