From 8c91c9d76b5a71feb2ece7bc5cdb2d50221c19fd Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 00:19:59 +0530 Subject: [PATCH 01/28] refactor: replace ContextMenu with DropdownMenu in SearchSpaceAvatar for improved mobile and desktop interactions --- .../layout/ui/icon-rail/SearchSpaceAvatar.tsx | 118 ++++++++---------- 1 file changed, 52 insertions(+), 66 deletions(-) diff --git a/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx b/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx index 944536c8f..0aed0db61 100644 --- a/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx +++ b/surfsense_web/components/layout/ui/icon-rail/SearchSpaceAvatar.tsx @@ -2,14 +2,9 @@ import { Settings, Trash2, Users } from "lucide-react"; import { useTranslations } from "next-intl"; +import type { MouseEvent } from "react"; import { useCallback, useRef, useState } from "react"; import { Button } from "@/components/ui/button"; -import { - ContextMenu, - ContextMenuContent, - ContextMenuItem, - ContextMenuTrigger, -} from "@/components/ui/context-menu"; import { DropdownMenu, DropdownMenuContent, @@ -80,19 +75,28 @@ export function SearchSpaceAvatar({ const initials = getInitials(name); const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm"; - // Long-press state for mobile - const [longPressMenuOpen, setLongPressMenuOpen] = useState(false); + const [menuOpen, setMenuOpen] = useState(false); const longPressTimer = useRef | null>(null); const touchMoved = useRef(false); + const openMenu = useCallback(() => { + setMenuOpen(true); + }, []); + + const handleContextMenu = useCallback((event: MouseEvent) => { + event.preventDefault(); + event.stopPropagation(); + setMenuOpen(true); + }, []); + const handleTouchStart = useCallback(() => { touchMoved.current = false; longPressTimer.current = setTimeout(() => { if (!touchMoved.current) { - setLongPressMenuOpen(true); + openMenu(); } }, 500); - }, []); + }, [openMenu]); const handleTouchMove = useCallback(() => { touchMoved.current = true; @@ -120,12 +124,26 @@ export function SearchSpaceAvatar({ ); - const avatarButton = ( + const avatarButton = (withMenuHandlers = false) => ( diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx index 88e99e67f..477d7ee77 100644 --- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx +++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtomValue } from "jotai"; -import { AlertTriangle, Settings } from "lucide-react"; +import { AlertTriangle } from "lucide-react"; import { useRouter } from "next/navigation"; import { createContext, @@ -148,7 +148,7 @@ const DocumentUploadPopupContent: FC<{
{!isLoading && !hasDocumentSummaryLLM ? (
- + LLM Configuration Required @@ -159,13 +159,12 @@ const DocumentUploadPopupContent: FC<{

diff --git a/surfsense_web/components/ui/button.tsx b/surfsense_web/components/ui/button.tsx index 66d8e231c..f7fb1f045 100644 --- a/surfsense_web/components/ui/button.tsx +++ b/surfsense_web/components/ui/button.tsx @@ -14,7 +14,7 @@ const buttonVariants = cva( "bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40", outline: "border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground", - secondary: "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80", + secondary: "bg-accent text-accent-foreground shadow-xs hover:bg-accent/80", ghost: "hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0", link: "text-primary underline-offset-4 hover:underline", From 60049936e30867bb47892eb89aae4ccb8d091aac Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:00:56 +0530 Subject: [PATCH 05/28] chore(dev): add local OpenTelemetry backend configuration --- docker/docker-compose.dev.yml | 9 +++++++++ surfsense_backend/.env.example | 11 ++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 53b8ea1a9..58cb7b42f 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -78,6 +78,15 @@ services: timeout: 5s retries: 5 + otel-lgtm: + image: grafana/otel-lgtm:latest + ports: + - "${OTEL_GRPC_PORT:-4317}:4317" + - "${OTEL_HTTP_PORT:-4318}:4318" + - "${OTEL_GRAFANA_PORT:-3001}:3000" + - "${OTEL_TEMPO_PORT:-3200}:3200" + restart: unless-stopped + searxng: image: searxng/searxng:2026.3.13-3c1f68c59 ports: diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 3d442973c..908f4645d 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -303,8 +303,17 @@ LANGSMITH_PROJECT=surfsense # SURFSENSE_ENABLE_BUSY_MUTEX=false # SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call -# Observability — OTel (also requires OTEL_EXPORTER_OTLP_ENDPOINT) +# Observability - OTel # SURFSENSE_ENABLE_OTEL=false +# OpenTelemetry - endpoint enables export; absent = no-op. +# Production should point at an OTel Collector. For local docker-compose.dev.yml, +# use http://otel-lgtm:4317 instead. +# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +# OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf +# OTEL_SERVICE_NAME=surfsense-backend +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production +# OTEL_METRIC_EXPORT_INTERVAL=60000 # ms +# OTEL_SDK_DISABLED=false # spec kill-switch # Skills + subagents # SURFSENSE_ENABLE_SKILLS=false From eb2e2b253baa7cb21c0e3558dbe610ea994d4bda Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:01:54 +0530 Subject: [PATCH 06/28] feat(observability): add OpenTelemetry process bootstrap --- surfsense_backend/app/app.py | 39 +- surfsense_backend/app/celery_app.py | 4 + .../app/observability/__init__.py | 2 + .../app/observability/bootstrap.py | 361 ++++++++++++++++++ surfsense_backend/app/observability/otel.py | 5 + .../app/tasks/celery_tasks/__init__.py | 4 + surfsense_backend/main.py | 19 +- 7 files changed, 413 insertions(+), 21 deletions(-) create mode 100644 surfsense_backend/app/observability/bootstrap.py diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index fc6242643..43b0af7d2 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -1,4 +1,5 @@ import asyncio +import contextlib import gc import logging import time @@ -36,13 +37,15 @@ from app.config import ( ) from app.db import User, create_db_and_tables, get_async_session from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError +from app.observability import metrics as ot_metrics +from app.observability.bootstrap import init_otel, shutdown_otel from app.rate_limiter import get_real_client_ip, limiter from app.routes import router as crud_router from app.routes.auth_routes import router as auth_router from app.schemas import UserCreate, UserRead, UserUpdate from app.tasks.surfsense_docs_indexer import seed_surfsense_docs from app.users import SECRET, auth_backend, current_active_user, fastapi_users -from app.utils.perf import get_perf_logger, log_system_snapshot +from app.utils.perf import log_system_snapshot _error_logger = logging.getLogger("surfsense.errors") @@ -127,6 +130,8 @@ def _http_exception_handler(request: Request, exc: HTTPException) -> JSONRespons logged server-side. """ rid = _get_request_id(request) + if exc.status_code in {401, 403} and request.url.path.startswith("/auth"): + ot_metrics.record_auth_failure(reason=_status_to_code(exc.status_code)) should_sanitize = exc.status_code == 500 # Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."}) @@ -213,6 +218,7 @@ def _validation_error_handler( def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse: """Catch-all: log full traceback, return sanitized 500.""" rid = _get_request_id(request) + ot_metrics.record_auth_failure(reason="unhandled_exception") _error_logger.error( "[%s] Unhandled exception on %s %s", rid, @@ -246,6 +252,7 @@ def _status_to_code(status_code: int, detail: str = "") -> str: def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded): """Custom 429 handler that returns JSON matching our error envelope.""" rid = _get_request_id(request) + ot_metrics.record_rate_limit_rejection(scope="slowapi") retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60" return _build_error_response( 429, @@ -306,6 +313,7 @@ def _check_rate_limit_memory( f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} " f"({len(timestamps)}/{max_requests} in {window_seconds}s)" ) + ot_metrics.record_rate_limit_rejection(scope=scope) raise HTTPException( status_code=429, detail="RATE_LIMIT_EXCEEDED", @@ -349,6 +357,7 @@ def _check_rate_limit( f"Rate limit exceeded on {scope} for IP {client_ip} " f"({current_count}/{max_requests} in {window_seconds}s)" ) + ot_metrics.record_rate_limit_rejection(scope=scope) raise HTTPException( status_code=429, detail="RATE_LIMIT_EXCEEDED", @@ -558,6 +567,7 @@ async def lifespan(app: FastAPI): gc.set_threshold(700, 10, 5) _enable_slow_callback_logging(threshold_sec=0.5) + init_otel(app) await create_db_and_tables() await setup_checkpointer_tables() initialize_openrouter_integration() @@ -592,6 +602,7 @@ async def lifespan(app: FastAPI): _stop_openrouter_background_refresh() await close_checkpointer() + shutdown_otel() def registration_allowed(): @@ -676,32 +687,20 @@ class RequestPerfMiddleware(BaseHTTPMiddleware): async def dispatch( self, request: StarletteRequest, call_next: RequestResponseEndpoint ) -> StarletteResponse: - perf = get_perf_logger() t0 = time.perf_counter() response = await call_next(request) elapsed_ms = (time.perf_counter() - t0) * 1000 path = request.url.path - method = request.method - status = response.status_code - - perf.debug( - "[request] %s %s -> %d in %.1fms", - method, - path, - status, - elapsed_ms, - ) if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD: - perf.warning( - "[SLOW_REQUEST] %s %s -> %d in %.1fms (threshold=%.0fms)", - method, - path, - status, - elapsed_ms, - _PERF_SLOW_REQUEST_THRESHOLD, - ) + with contextlib.suppress(Exception): + from opentelemetry import trace + + span = trace.get_current_span() + span.set_attribute("slow_request", True) + span.set_attribute("surfsense.request.elapsed_ms", elapsed_ms) + span.set_attribute("http.route", path) log_system_snapshot("slow_request") return response diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 74710d5e1..cfb24731d 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -18,6 +18,10 @@ def init_worker(**kwargs): This ensures the Auto mode (LiteLLM Router) is available for background tasks like document summarization and image generation. """ + from app.observability.bootstrap import init_otel + + init_otel(app=None, traces=True, metrics=True, logs=True) + from app.config import ( initialize_image_gen_router, initialize_llm_router, diff --git a/surfsense_backend/app/observability/__init__.py b/surfsense_backend/app/observability/__init__.py index dbf082561..a675b1dae 100644 --- a/surfsense_backend/app/observability/__init__.py +++ b/surfsense_backend/app/observability/__init__.py @@ -5,3 +5,5 @@ small wrapper around the optional ``opentelemetry`` instrumentation. The wrapper is a no-op when OTEL is not configured, so importing it from performance-critical paths is safe. """ + +__all__ = ["bootstrap", "metrics", "otel"] diff --git a/surfsense_backend/app/observability/bootstrap.py b/surfsense_backend/app/observability/bootstrap.py new file mode 100644 index 000000000..a1b9817da --- /dev/null +++ b/surfsense_backend/app/observability/bootstrap.py @@ -0,0 +1,361 @@ +"""Programmatic OpenTelemetry bootstrap for SurfSense backend processes.""" + +from __future__ import annotations + +import contextlib +import logging +import os +import socket +from importlib import metadata +from typing import Any + +from app.observability import otel + +logger = logging.getLogger(__name__) + +_BOOL_TRUE = {"1", "true", "yes", "on"} + +_TRACES_INITIALIZED = False +_METRICS_INITIALIZED = False +_LOGS_INITIALIZED = False +_FASTAPI_INSTRUMENTED = False +_SQLALCHEMY_INSTRUMENTED = False +_PSYCOPG_INSTRUMENTED = False +_REDIS_INSTRUMENTED = False +_HTTPX_INSTRUMENTED = False +_CELERY_INSTRUMENTED = False + +_TRACER_PROVIDER: Any | None = None +_METER_PROVIDER: Any | None = None + + +def _env_truthy(name: str) -> bool: + return os.environ.get(name, "").strip().lower() in _BOOL_TRUE + + +def is_otel_disabled() -> bool: + """Return true when either SurfSense or OTel's spec kill switch is set.""" + return _env_truthy("SURFSENSE_DISABLE_OTEL") or _env_truthy("OTEL_SDK_DISABLED") + + +def is_otel_configured() -> bool: + """Return true when this process should export OTel signals.""" + return bool( + os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") + or os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") + or os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") + ) + + +def _package_version() -> str: + with contextlib.suppress(metadata.PackageNotFoundError): + return metadata.version("surf-new-backend") + return "unknown" + + +def _deployment_environment() -> str: + return ( + os.environ.get("SURFSENSE_ENV") + or os.environ.get("APP_ENV") + or os.environ.get("ENVIRONMENT") + or "dev" + ) + + +def _build_resource(): + from opentelemetry.sdk.resources import Resource + + return Resource.create( + { + "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"), + "service.version": _package_version(), + "service.instance.id": socket.gethostname(), + "deployment.environment": _deployment_environment(), + } + ) + + +def _otlp_protocol() -> str: + return os.environ.get("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc").strip().lower() + + +def _trace_exporter(): + if _otlp_protocol() == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter, + ) + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") + return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter() + + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") + return OTLPSpanExporter(endpoint=endpoint) if endpoint else OTLPSpanExporter() + + +def _metric_exporter(): + if _otlp_protocol() == "http/protobuf": + from opentelemetry.exporter.otlp.proto.http.metric_exporter import ( + OTLPMetricExporter, + ) + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") + return ( + OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter() + ) + + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( + OTLPMetricExporter, + ) + + endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") + return OTLPMetricExporter(endpoint=endpoint) if endpoint else OTLPMetricExporter() + + +def _safe_instrument(name: str, instrument: Any) -> bool: + try: + instrument() + except Exception: + logger.warning("OpenTelemetry %s instrumentation failed", name, exc_info=True) + return False + return True + + +def _instrument_fastapi(app: Any | None) -> None: + global _FASTAPI_INSTRUMENTED + if app is None or _FASTAPI_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor + + FastAPIInstrumentor.instrument_app( + app, + excluded_urls="/health,/ready,/metrics", + ) + + if _safe_instrument("FastAPI", _run): + _FASTAPI_INSTRUMENTED = True + + +def instrument_sqlalchemy_engine(engine: Any) -> None: + """Instrument a SQLAlchemy engine once per process.""" + global _SQLALCHEMY_INSTRUMENTED + if _SQLALCHEMY_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor + + SQLAlchemyInstrumentor().instrument( + engine=getattr(engine, "sync_engine", engine), + enable_commenter=True, + ) + + if _safe_instrument("SQLAlchemy", _run): + _SQLALCHEMY_INSTRUMENTED = True + + +def _instrument_sqlalchemy() -> None: + if _SQLALCHEMY_INSTRUMENTED: + return + with contextlib.suppress(Exception): + from app.db import engine + + instrument_sqlalchemy_engine(engine) + + +def _instrument_psycopg() -> None: + global _PSYCOPG_INSTRUMENTED + if _PSYCOPG_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.psycopg import PsycopgInstrumentor + + PsycopgInstrumentor().instrument() + + if _safe_instrument("psycopg", _run): + _PSYCOPG_INSTRUMENTED = True + + +def _instrument_redis() -> None: + global _REDIS_INSTRUMENTED + if _REDIS_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.redis import RedisInstrumentor + + RedisInstrumentor().instrument() + + if _safe_instrument("Redis", _run): + _REDIS_INSTRUMENTED = True + + +def _instrument_httpx() -> None: + global _HTTPX_INSTRUMENTED + if _HTTPX_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor + + HTTPXClientInstrumentor().instrument() + + if _safe_instrument("HTTPX", _run): + _HTTPX_INSTRUMENTED = True + + +def instrument_celery() -> None: + """Instrument Celery producer/consumer hooks once per process.""" + global _CELERY_INSTRUMENTED + if _CELERY_INSTRUMENTED: + return + + def _run() -> None: + from opentelemetry.instrumentation.celery import CeleryInstrumentor + + CeleryInstrumentor().instrument() + + if _safe_instrument("Celery", _run): + _CELERY_INSTRUMENTED = True + + +def _instrument_libraries(app: Any | None) -> None: + _instrument_fastapi(app) + _instrument_sqlalchemy() + _instrument_psycopg() + _instrument_redis() + _instrument_httpx() + instrument_celery() + + +def init_traces(app: Any | None = None) -> None: + """Install the tracer provider, span processor, exporter, and instrumentors.""" + global _TRACER_PROVIDER, _TRACES_INITIALIZED + if _TRACES_INITIALIZED: + _instrument_fastapi(app) + return + + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased + + provider = TracerProvider( + resource=_build_resource(), + sampler=ParentBased(ALWAYS_ON), + ) + provider.add_span_processor(BatchSpanProcessor(_trace_exporter())) + + try: + trace.set_tracer_provider(provider) + except Exception: + logger.warning( + "OpenTelemetry tracer provider was already set; reusing existing provider", + exc_info=True, + ) + _TRACER_PROVIDER = trace.get_tracer_provider() + else: + _TRACER_PROVIDER = provider + + _TRACES_INITIALIZED = True + otel.reload_for_tests() + _instrument_libraries(app) + + +def init_metrics() -> None: + """Install the meter provider, metric reader, exporter, and custom gauges.""" + global _METER_PROVIDER, _METRICS_INITIALIZED + if _METRICS_INITIALIZED: + return + + from opentelemetry import metrics + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader + + interval_ms = int(os.environ.get("OTEL_METRIC_EXPORT_INTERVAL", "60000")) + reader = PeriodicExportingMetricReader( + _metric_exporter(), + export_interval_millis=interval_ms, + ) + provider = MeterProvider(metric_readers=[reader], resource=_build_resource()) + + try: + metrics.set_meter_provider(provider) + except Exception: + logger.warning( + "OpenTelemetry meter provider was already set; reusing existing provider", + exc_info=True, + ) + _METER_PROVIDER = metrics.get_meter_provider() + else: + _METER_PROVIDER = provider + + _METRICS_INITIALIZED = True + from app.observability.metrics import register_runtime_observables + + register_runtime_observables() + + +def init_logs() -> None: + """Enable trace/span correlation fields on stdlib LogRecords.""" + global _LOGS_INITIALIZED + if _LOGS_INITIALIZED: + return + + def _run() -> None: + from opentelemetry.instrumentation.logging import LoggingInstrumentor + + LoggingInstrumentor().instrument() + + if _safe_instrument("logging", _run): + _LOGS_INITIALIZED = True + + +def init_otel( + app: Any | None = None, + *, + traces: bool = True, + metrics: bool = True, + logs: bool = True, +) -> None: + """Initialize OpenTelemetry for a FastAPI or Celery process.""" + if is_otel_disabled() or not is_otel_configured(): + otel.reload_for_tests() + return + + if traces: + init_traces(app) + if metrics: + init_metrics() + if logs: + init_logs() + + +def shutdown_otel(timeout_millis: int = 5000) -> None: + """Best-effort flush and shutdown for installed providers.""" + for provider in (_TRACER_PROVIDER, _METER_PROVIDER): + if provider is None: + continue + with contextlib.suppress(Exception): + provider.force_flush(timeout_millis=timeout_millis) + with contextlib.suppress(Exception): + provider.shutdown() + + +__all__ = [ + "_BOOL_TRUE", + "_build_resource", + "init_logs", + "init_metrics", + "init_otel", + "init_traces", + "instrument_celery", + "instrument_sqlalchemy_engine", + "is_otel_configured", + "is_otel_disabled", + "shutdown_otel", +] diff --git a/surfsense_backend/app/observability/otel.py b/surfsense_backend/app/observability/otel.py index 6791ab499..f39cfe535 100644 --- a/surfsense_backend/app/observability/otel.py +++ b/surfsense_backend/app/observability/otel.py @@ -66,6 +66,8 @@ def _resolve_enabled() -> bool: # Honor an explicit kill-switch first. if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}: return False + if os.environ.get("OTEL_SDK_DISABLED", "").lower() in {"1", "true", "yes", "on"}: + return False # Treat a configured endpoint as the canonical "OTel is wired up" signal. if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"): return True @@ -198,8 +200,11 @@ def model_call_span( attrs: dict[str, Any] = {} if model_id: attrs["model.id"] = model_id + attrs["gen_ai.request.model"] = model_id if provider: attrs["model.provider"] = provider + attrs["gen_ai.provider.name"] = provider + attrs["gen_ai.operation.name"] = "chat" if extra: attrs.update(extra) return span("model.call", attributes=attrs) diff --git a/surfsense_backend/app/tasks/celery_tasks/__init__.py b/surfsense_backend/app/tasks/celery_tasks/__init__.py index b23359f36..6ea7a2e68 100644 --- a/surfsense_backend/app/tasks/celery_tasks/__init__.py +++ b/surfsense_backend/app/tasks/celery_tasks/__init__.py @@ -37,6 +37,10 @@ def get_celery_session_maker() -> async_sessionmaker: poolclass=NullPool, echo=False, ) + with contextlib.suppress(Exception): + from app.observability.bootstrap import instrument_sqlalchemy_engine + + instrument_sqlalchemy_engine(_celery_engine) _celery_session_maker = async_sessionmaker( _celery_engine, expire_on_commit=False ) diff --git a/surfsense_backend/main.py b/surfsense_backend/main.py index 4a7a9b7b1..54911a34d 100644 --- a/surfsense_backend/main.py +++ b/surfsense_backend/main.py @@ -12,9 +12,26 @@ if sys.platform == "win32": from app.config.uvicorn import load_uvicorn_config +_old_log_record_factory = logging.getLogRecordFactory() + + +def _otel_safe_log_record_factory(*args, **kwargs): + record = _old_log_record_factory(*args, **kwargs) + if not hasattr(record, "otelTraceID"): + record.otelTraceID = "0" + if not hasattr(record, "otelSpanID"): + record.otelSpanID = "0" + return record + + +logging.setLogRecordFactory(_otel_safe_log_record_factory) + logging.basicConfig( level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + format=( + "%(asctime)s - %(name)s - %(levelname)s - " + "[trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] %(message)s" + ), datefmt="%Y-%m-%d %H:%M:%S", ) From 6095b48b5f06707f53876c76b801945f6644c5c5 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:02:20 +0530 Subject: [PATCH 07/28] feat(observability): add SurfSense metric helpers --- .../app/observability/metrics.py | 416 ++++++++++++++++++ surfsense_backend/app/utils/perf.py | 4 + .../tests/unit/observability/test_otel.py | 163 ++++++- 3 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 surfsense_backend/app/observability/metrics.py diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py new file mode 100644 index 000000000..fb99b4cc8 --- /dev/null +++ b/surfsense_backend/app/observability/metrics.py @@ -0,0 +1,416 @@ +"""Custom OpenTelemetry metrics for SurfSense. + +This module owns all SurfSense-specific metric instruments. Callers use the +small helper functions below instead of constructing instruments directly so +attribute names and cardinality stay consistent across the backend. +""" + +from __future__ import annotations + +import contextlib +import gc +import logging +from functools import lru_cache +from importlib import metadata +from typing import Any + +from app.observability import otel + +logger = logging.getLogger(__name__) + +_INSTRUMENTATION_NAME = "surfsense.platform" +_OBSERVABLES_REGISTERED = False + + +def _package_version() -> str: + with contextlib.suppress(metadata.PackageNotFoundError): + return metadata.version("surf-new-backend") + return "unknown" + + +def _is_enabled() -> bool: + return otel.is_enabled() + + +def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]: + """Drop empty values and coerce low-cardinality attrs to OTel-safe scalars.""" + cleaned: dict[str, str | int | float | bool] = {} + for key, value in attrs.items(): + if value is None: + continue + if isinstance(value, bool | int | float): + cleaned[key] = value + continue + text = str(value) + if text: + cleaned[key] = text + return cleaned + + +def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None: + if not _is_enabled(): + return + with contextlib.suppress(Exception): + callable_obj.record(value, _clean_attrs(attrs)) + + +def _add(callable_obj: Any, value: int, attrs: dict[str, Any]) -> None: + if not _is_enabled(): + return + with contextlib.suppress(Exception): + callable_obj.add(value, _clean_attrs(attrs)) + + +@lru_cache(maxsize=1) +def _get_meter(): + from opentelemetry import metrics + + return metrics.get_meter(_INSTRUMENTATION_NAME, _package_version()) + + +@lru_cache(maxsize=1) +def _model_call_duration(): + return _get_meter().create_histogram( + "surfsense.model.call.duration", + unit="ms", + description="Duration of SurfSense LLM model calls.", + ) + + +@lru_cache(maxsize=1) +def _model_token_usage(): + return _get_meter().create_histogram( + "gen_ai.client.token.usage", + unit="{token}", + description="Token usage reported by GenAI model responses.", + ) + + +@lru_cache(maxsize=1) +def _tool_call_duration(): + return _get_meter().create_histogram( + "surfsense.tool.call.duration", + unit="ms", + description="Duration of SurfSense agent tool calls.", + ) + + +@lru_cache(maxsize=1) +def _tool_call_errors(): + return _get_meter().create_counter( + "surfsense.tool.call.errors", + description="Count of SurfSense agent tool call errors.", + ) + + +@lru_cache(maxsize=1) +def _kb_search_duration(): + return _get_meter().create_histogram( + "surfsense.kb.search.duration", + unit="ms", + description="Duration of SurfSense knowledge-base search calls.", + ) + + +@lru_cache(maxsize=1) +def _compaction_runs(): + return _get_meter().create_counter( + "surfsense.compaction.runs", + description="Count of SurfSense conversation compaction runs.", + ) + + +@lru_cache(maxsize=1) +def _permission_asks(): + return _get_meter().create_counter( + "surfsense.permission.asks", + description="Count of SurfSense permission asks.", + ) + + +@lru_cache(maxsize=1) +def _interrupts(): + return _get_meter().create_counter( + "surfsense.interrupt.raised", + description="Count of SurfSense interrupts raised.", + ) + + +@lru_cache(maxsize=1) +def _indexing_document_duration(): + return _get_meter().create_histogram( + "surfsense.indexing.document.duration", + unit="s", + description="Duration of SurfSense document indexing.", + ) + + +@lru_cache(maxsize=1) +def _indexing_document_outcome(): + return _get_meter().create_counter( + "surfsense.indexing.document.outcome", + description="Count of SurfSense document indexing outcomes.", + ) + + +@lru_cache(maxsize=1) +def _connector_sync_duration(): + return _get_meter().create_histogram( + "surfsense.connector.sync.duration", + unit="s", + description="Duration of SurfSense connector sync tasks.", + ) + + +@lru_cache(maxsize=1) +def _connector_sync_outcome(): + return _get_meter().create_counter( + "surfsense.connector.sync.outcome", + description="Count of SurfSense connector sync outcomes.", + ) + + +@lru_cache(maxsize=1) +def _auth_failures(): + return _get_meter().create_counter( + "surfsense.auth.failures", + description="Count of SurfSense authentication failures.", + ) + + +@lru_cache(maxsize=1) +def _rate_limit_rejections(): + return _get_meter().create_counter( + "surfsense.rate_limit.rejections", + description="Count of SurfSense rate-limit rejections.", + ) + + +@lru_cache(maxsize=1) +def _perf_elapsed(): + return _get_meter().create_histogram( + "surfsense.perf.elapsed_ms", + unit="ms", + description="Elapsed time recorded by SurfSense perf timers.", + ) + + +def record_model_call_duration( + duration_ms: float, *, model: str | None, provider: str | None +) -> None: + _record( + _model_call_duration(), + duration_ms, + { + "gen_ai.request.model": model, + "gen_ai.provider.name": provider, + }, + ) + + +def record_model_token_usage( + *, + input_tokens: int | None, + output_tokens: int | None, + model: str | None, + provider: str | None, +) -> None: + base = { + "gen_ai.request.model": model, + "gen_ai.provider.name": provider, + "gen_ai.operation.name": "chat", + } + if input_tokens is not None: + _record( + _model_token_usage(), + int(input_tokens), + {**base, "gen_ai.token.type": "input"}, + ) + if output_tokens is not None: + _record( + _model_token_usage(), + int(output_tokens), + {**base, "gen_ai.token.type": "output"}, + ) + + +def record_tool_call_duration(duration_ms: float, *, tool_name: str) -> None: + _record(_tool_call_duration(), duration_ms, {"tool.name": tool_name}) + + +def record_tool_call_error(*, tool_name: str) -> None: + _add(_tool_call_errors(), 1, {"tool.name": tool_name}) + + +def record_kb_search_duration( + duration_ms: float, *, search_space_id: int | None, surface: str +) -> None: + _record( + _kb_search_duration(), + duration_ms, + {"search_space.id": search_space_id, "search.surface": surface}, + ) + + +def record_compaction_run(*, reason: str | None) -> None: + _add(_compaction_runs(), 1, {"compaction.reason": reason or "unknown"}) + + +def record_permission_ask(*, permission: str) -> None: + _add(_permission_asks(), 1, {"permission.permission": permission}) + + +def record_interrupt(*, interrupt_type: str) -> None: + _add(_interrupts(), 1, {"interrupt.type": interrupt_type}) + + +def record_indexing_document_duration( + duration_s: float, *, document_type: str | None +) -> None: + _record( + _indexing_document_duration(), + duration_s, + {"document.type": document_type or "unknown"}, + ) + + +def record_indexing_document_outcome(*, document_type: str | None, status: str) -> None: + _add( + _indexing_document_outcome(), + 1, + {"document.type": document_type or "unknown", "status": status}, + ) + + +def record_connector_sync_duration( + duration_s: float, *, connector_type: str | None +) -> None: + _record( + _connector_sync_duration(), + duration_s, + {"connector.type": connector_type or "unknown"}, + ) + + +def record_connector_sync_outcome(*, connector_type: str | None, status: str) -> None: + _add( + _connector_sync_outcome(), + 1, + {"connector.type": connector_type or "unknown", "status": status}, + ) + + +def record_auth_failure(*, reason: str) -> None: + _add(_auth_failures(), 1, {"reason": reason}) + + +def record_rate_limit_rejection(*, scope: str) -> None: + _add(_rate_limit_rejections(), 1, {"scope": scope}) + + +def record_perf_elapsed(duration_ms: float, *, label: str) -> None: + _record(_perf_elapsed(), duration_ms, {"label": label}) + + +def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]: + from opentelemetry.metrics import Observation + + from app.utils.perf import system_snapshot + + snap = system_snapshot() + value = snap.get(key) + if not isinstance(value, int | float) or value < 0: + return [] + if transform is not None: + value = transform(value) + return [Observation(value)] + + +def _observe_gc_collections(_options: Any) -> list[Any]: + from opentelemetry.metrics import Observation + + return [ + Observation(count, {"generation": str(generation)}) + for generation, count in enumerate(gc.get_count()) + ] + + +def register_runtime_observables() -> None: + """Register process/runtime observable gauges once per process.""" + global _OBSERVABLES_REGISTERED + if _OBSERVABLES_REGISTERED or not _is_enabled(): + return + + meter = _get_meter() + try: + # Each callback returns the value for a single gauge except GC, whose + # callback carries a generation attribute. + meter.create_observable_gauge( + "process.runtime.cpython.memory.rss", + callbacks=[ + lambda _options: _runtime_snapshot_value( + "rss_mb", lambda v: float(v) * 1024 * 1024 + ) + ], + unit="By", + description="Resident set size of the SurfSense backend process.", + ) + meter.create_observable_gauge( + "process.runtime.cpython.cpu.utilization", + callbacks=[ + lambda _options: _runtime_snapshot_value( + "cpu_percent", lambda v: float(v) / 100.0 + ) + ], + unit="1", + description="CPU utilization of the SurfSense backend process.", + ) + meter.create_observable_gauge( + "process.runtime.cpython.threads", + callbacks=[lambda _options: _runtime_snapshot_value("threads")], + unit="{thread}", + description="Thread count of the SurfSense backend process.", + ) + meter.create_observable_gauge( + "process.runtime.cpython.open_fds", + callbacks=[lambda _options: _runtime_snapshot_value("open_fds")], + unit="{fd}", + description="Open file descriptor count of the SurfSense backend process.", + ) + meter.create_observable_gauge( + "python.asyncio.tasks", + callbacks=[lambda _options: _runtime_snapshot_value("asyncio_tasks")], + unit="{task}", + description="Live asyncio task count in the current event loop.", + ) + meter.create_observable_gauge( + "process.runtime.cpython.gc.collections", + callbacks=[_observe_gc_collections], + unit="{collection}", + description="CPython GC counters by generation.", + ) + except Exception: + logger.warning("Failed to register OTel runtime observables", exc_info=True) + return + + _OBSERVABLES_REGISTERED = True + + +__all__ = [ + "record_auth_failure", + "record_compaction_run", + "record_connector_sync_duration", + "record_connector_sync_outcome", + "record_indexing_document_duration", + "record_indexing_document_outcome", + "record_interrupt", + "record_kb_search_duration", + "record_model_call_duration", + "record_model_token_usage", + "record_perf_elapsed", + "record_permission_ask", + "record_rate_limit_rejection", + "record_tool_call_duration", + "record_tool_call_error", + "register_runtime_observables", +] diff --git a/surfsense_backend/app/utils/perf.py b/surfsense_backend/app/utils/perf.py index b2b26897c..541ee5756 100644 --- a/surfsense_backend/app/utils/perf.py +++ b/surfsense_backend/app/utils/perf.py @@ -16,6 +16,8 @@ import time from contextlib import asynccontextmanager, contextmanager from typing import Any +from app.observability import metrics as ot_metrics + _perf_log: logging.Logger | None = None _last_rss_mb: float = 0.0 @@ -50,6 +52,7 @@ def perf_timer(label: str, *, extra: dict[str, Any] | None = None): if extra: suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items()) log.info("%s in %.3fs%s", label, elapsed, suffix) + ot_metrics.record_perf_elapsed(elapsed * 1000, label=label) @asynccontextmanager @@ -68,6 +71,7 @@ async def perf_async_timer(label: str, *, extra: dict[str, Any] | None = None): if extra: suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items()) log.info("%s in %.3fs%s", label, elapsed, suffix) + ot_metrics.record_perf_elapsed(elapsed * 1000, label=label) def system_snapshot() -> dict[str, Any]: diff --git a/surfsense_backend/tests/unit/observability/test_otel.py b/surfsense_backend/tests/unit/observability/test_otel.py index fc5813973..be40cccde 100644 --- a/surfsense_backend/tests/unit/observability/test_otel.py +++ b/surfsense_backend/tests/unit/observability/test_otel.py @@ -4,7 +4,7 @@ from __future__ import annotations import pytest -from app.observability import otel +from app.observability import bootstrap, metrics, otel pytestmark = pytest.mark.unit @@ -12,7 +12,14 @@ pytestmark = pytest.mark.unit @pytest.fixture(autouse=True) def _reset_otel_state(monkeypatch: pytest.MonkeyPatch): """Force a clean OTel disabled state per test, then restore after.""" - for env in ("OTEL_EXPORTER_OTLP_ENDPOINT", "SURFSENSE_DISABLE_OTEL"): + for env in ( + "OTEL_EXPORTER_OTLP_ENDPOINT", + "OTEL_EXPORTER_OTLP_PROTOCOL", + "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", + "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", + "SURFSENSE_DISABLE_OTEL", + "OTEL_SDK_DISABLED", + ): monkeypatch.delenv(env, raising=False) monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") otel.reload_for_tests() @@ -36,6 +43,158 @@ def test_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None assert otel.reload_for_tests() is False +def test_spec_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + monkeypatch.setenv("OTEL_SDK_DISABLED", "true") + assert otel.reload_for_tests() is False + + +class TestBootstrapConfig: + def test_disabled_checks_both_kill_switches( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + assert bootstrap.is_otel_disabled() is False + + monkeypatch.setenv("OTEL_SDK_DISABLED", "on") + assert bootstrap.is_otel_disabled() is True + + def test_configured_by_shared_or_signal_endpoint( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + assert bootstrap.is_otel_configured() is False + + monkeypatch.setenv( + "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "http://localhost:4317" + ) + assert bootstrap.is_otel_configured() is True + + def test_init_otel_noops_when_disabled( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + called = {"traces": False} + + def fake_init_traces(app=None): + del app + called["traces"] = True + + monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + monkeypatch.setattr(bootstrap, "init_traces", fake_init_traces) + + bootstrap.init_otel() + assert called["traces"] is False + + def test_init_otel_dispatches_enabled_signals( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + called: list[str] = [] + + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + monkeypatch.setattr( + bootstrap, "init_traces", lambda app=None: called.append("traces") + ) + monkeypatch.setattr(bootstrap, "init_metrics", lambda: called.append("metrics")) + monkeypatch.setattr(bootstrap, "init_logs", lambda: called.append("logs")) + + bootstrap.init_otel() + assert called == ["traces", "metrics", "logs"] + + def test_resource_defaults_include_service_metadata( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("OTEL_SERVICE_NAME", "custom-backend") + monkeypatch.setenv("SURFSENSE_ENV", "test") + + resource = bootstrap._build_resource() + attrs = dict(resource.attributes) + assert attrs["service.name"] == "custom-backend" + assert attrs["deployment.environment"] == "test" + assert attrs["service.instance.id"] + + def test_shutdown_is_safe_without_providers(self) -> None: + bootstrap.shutdown_otel() + + +class TestMetricHelpers: + def test_all_metric_helpers_noop_safely_when_disabled(self) -> None: + metrics.record_model_call_duration(12.5, model="gpt-4o", provider="openai") + metrics.record_model_token_usage( + input_tokens=10, + output_tokens=5, + model="gpt-4o", + provider="openai", + ) + metrics.record_tool_call_duration(3.0, tool_name="web_search") + metrics.record_tool_call_error(tool_name="web_search") + metrics.record_kb_search_duration( + 4.0, + search_space_id=1, + surface="documents", + ) + metrics.record_compaction_run(reason="auto") + metrics.record_permission_ask(permission="write_file") + metrics.record_interrupt(interrupt_type="permission_ask") + metrics.record_indexing_document_duration(1.2, document_type="FILE") + metrics.record_indexing_document_outcome(document_type="FILE", status="success") + metrics.record_connector_sync_duration( + 2.3, + connector_type="index_notion_pages", + ) + metrics.record_connector_sync_outcome( + connector_type="index_notion_pages", + status="success", + ) + metrics.record_auth_failure(reason="UNAUTHORIZED") + metrics.record_rate_limit_rejection(scope="login") + metrics.record_perf_elapsed(7.0, label="[test]") + + def test_runtime_observables_register_once( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + class FakeMeter: + def __init__(self) -> None: + self.names: list[str] = [] + + def create_observable_gauge(self, name: str, **kwargs) -> None: + del kwargs + self.names.append(name) + + fake_meter = FakeMeter() + monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False) + monkeypatch.setattr(metrics, "_is_enabled", lambda: True) + monkeypatch.setattr(metrics, "_get_meter", lambda: fake_meter) + + metrics.register_runtime_observables() + metrics.register_runtime_observables() + + assert len(fake_meter.names) == 6 + assert fake_meter.names.count("python.asyncio.tasks") == 1 + monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False) + + +def test_log_record_factory_provides_zero_otel_fields() -> None: + import logging + + import main # noqa: F401 + + record = logging.getLogRecordFactory()( + "test", + logging.INFO, + __file__, + 1, + "hello", + (), + None, + ) + assert record.otelTraceID == "0" + assert record.otelSpanID == "0" + + class TestNoopSpansWhenDisabled: def test_generic_span_yields_noop(self) -> None: with otel.span("any.thing", attributes={"x": 1}) as sp: From ea3d0a64638c1bac1e3c6585fb2d53b7e6e17373 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:02:36 +0530 Subject: [PATCH 08/28] feat(agents): emit metrics for model and tool calls --- .../agents/new_chat/middleware/otel_span.py | 103 +++++++++++++++-- .../unit/agents/new_chat/test_otel_span.py | 104 ++++++++++++++++-- 2 files changed, 190 insertions(+), 17 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/middleware/otel_span.py b/surfsense_backend/app/agents/new_chat/middleware/otel_span.py index cfe1edae4..ecaa042a9 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/otel_span.py +++ b/surfsense_backend/app/agents/new_chat/middleware/otel_span.py @@ -16,13 +16,14 @@ dashboards expect. from __future__ import annotations import logging +import time from collections.abc import Awaitable, Callable from typing import TYPE_CHECKING, Any from langchain.agents.middleware import AgentMiddleware from langchain_core.messages import AIMessage, ToolMessage -from app.observability import otel as ot +from app.observability import metrics as ot_metrics, otel as ot if TYPE_CHECKING: # pragma: no cover — type-only from langchain.agents.middleware.types import ( @@ -62,14 +63,37 @@ class OtelSpanMiddleware(AgentMiddleware): return await handler(request) model_id, provider = _resolve_model_attrs(request) + t0 = time.perf_counter() with ot.model_call_span(model_id=model_id, provider=provider) as sp: + _annotate_model_request(sp, model_id=model_id, provider=provider) try: result = await handler(request) except Exception: + ot_metrics.record_model_call_duration( + (time.perf_counter() - t0) * 1000, + model=model_id, + provider=provider, + ) # span context manager records + re-raises raise else: - _annotate_model_response(sp, result) + input_tokens, output_tokens = _annotate_model_response( + sp, + result, + model_id=model_id, + provider=provider, + ) + ot_metrics.record_model_call_duration( + (time.perf_counter() - t0) * 1000, + model=model_id, + provider=provider, + ) + ot_metrics.record_model_token_usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + model=model_id, + provider=provider, + ) return result # ------------------------------------------------------------------ @@ -87,9 +111,24 @@ class OtelSpanMiddleware(AgentMiddleware): tool_name = _resolve_tool_name(request) input_size = _resolve_input_size(request) + t0 = time.perf_counter() with ot.tool_call_span(tool_name, input_size=input_size) as sp: - result = await handler(request) - _annotate_tool_result(sp, result) + try: + result = await handler(request) + except Exception: + ot_metrics.record_tool_call_duration( + (time.perf_counter() - t0) * 1000, + tool_name=tool_name, + ) + ot_metrics.record_tool_call_error(tool_name=tool_name) + raise + errored = _annotate_tool_result(sp, result) + ot_metrics.record_tool_call_duration( + (time.perf_counter() - t0) * 1000, + tool_name=tool_name, + ) + if errored: + ot_metrics.record_tool_call_error(tool_name=tool_name) return result @@ -154,8 +193,29 @@ def _resolve_input_size(request: Any) -> int | None: return None -def _annotate_model_response(span: Any, result: Any) -> None: +def _annotate_model_request( + span: Any, *, model_id: str | None, provider: str | None +) -> None: + try: + span.set_attribute("gen_ai.operation.name", "chat") + if model_id: + span.set_attribute("gen_ai.request.model", model_id) + if provider: + span.set_attribute("gen_ai.provider.name", provider) + except Exception: # pragma: no cover — defensive + pass + + +def _annotate_model_response( + span: Any, + result: Any, + *, + model_id: str | None = None, + provider: str | None = None, +) -> tuple[int | None, int | None]: """Best-effort: attach prompt/completion token counts when available.""" + input_tokens: int | None = None + output_tokens: int | None = None try: # ModelResponse may be a dataclass with .result containing AIMessage msg: Any @@ -165,22 +225,42 @@ def _annotate_model_response(span: Any, result: Any) -> None: inner = getattr(result, "result", None) msg = inner[-1] if isinstance(inner, list) and inner else inner if msg is None: - return + return None, None + if provider: + span.set_attribute("gen_ai.provider.name", provider) + if model_id: + span.set_attribute("gen_ai.request.model", model_id) + response_model = getattr(msg, "response_metadata", {}) or {} + if isinstance(response_model, dict): + response_model = ( + response_model.get("model_name") + or response_model.get("model") + or response_model.get("model_id") + ) + if not response_model: + response_model = model_id + if response_model: + span.set_attribute("gen_ai.response.model", str(response_model)) + span.set_attribute("gen_ai.operation.name", "chat") usage = getattr(msg, "usage_metadata", None) or {} if isinstance(usage, dict): if (n := usage.get("input_tokens")) is not None: - span.set_attribute("tokens.prompt", int(n)) + input_tokens = int(n) + span.set_attribute("gen_ai.usage.input_tokens", input_tokens) if (n := usage.get("output_tokens")) is not None: - span.set_attribute("tokens.completion", int(n)) + output_tokens = int(n) + span.set_attribute("gen_ai.usage.output_tokens", output_tokens) if (n := usage.get("total_tokens")) is not None: - span.set_attribute("tokens.total", int(n)) + span.set_attribute("gen_ai.usage.total_tokens", int(n)) tool_calls = getattr(msg, "tool_calls", None) or [] span.set_attribute("model.tool_calls", len(tool_calls)) except Exception: # pragma: no cover — defensive pass + return input_tokens, output_tokens -def _annotate_tool_result(span: Any, result: Any) -> None: +def _annotate_tool_result(span: Any, result: Any) -> bool: + errored = False try: if isinstance(result, ToolMessage): content = ( @@ -192,11 +272,14 @@ def _annotate_tool_result(span: Any, result: Any) -> None: status = getattr(result, "status", None) if isinstance(status, str): span.set_attribute("tool.status", status) + errored = status.lower() == "error" kwargs = getattr(result, "additional_kwargs", None) or {} if isinstance(kwargs, dict) and kwargs.get("error"): span.set_attribute("tool.error", True) + errored = True except Exception: # pragma: no cover — defensive pass + return errored __all__ = ["OtelSpanMiddleware"] diff --git a/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py b/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py index 55434c04d..dc59c6dac 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py +++ b/surfsense_backend/tests/unit/agents/new_chat/test_otel_span.py @@ -23,6 +23,7 @@ pytestmark = pytest.mark.unit @pytest.fixture(autouse=True) def _disable_otel(monkeypatch: pytest.MonkeyPatch): monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False) + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") from app.observability import otel as ot @@ -99,16 +100,17 @@ class TestAnnotateModelResponse: "total_tokens": 150, }, ) - _annotate_model_response(sp, msg) - sp.set_attribute.assert_any_call("tokens.prompt", 100) - sp.set_attribute.assert_any_call("tokens.completion", 50) - sp.set_attribute.assert_any_call("tokens.total", 150) + assert _annotate_model_response(sp, msg) == (100, 50) + sp.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 100) + sp.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 50) + sp.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 150) + sp.set_attribute.assert_any_call("gen_ai.operation.name", "chat") def test_handles_response_with_no_metadata(self) -> None: sp = MagicMock() msg = AIMessage(content="hello") # Should not raise even when usage_metadata is missing - _annotate_model_response(sp, msg) + assert _annotate_model_response(sp, msg) == (None, None) class TestAnnotateToolResult: @@ -119,7 +121,7 @@ class TestAnnotateToolResult: tool_call_id="abc", status="success", ) - _annotate_tool_result(sp, result) + assert _annotate_tool_result(sp, result) is False sp.set_attribute.assert_any_call("tool.output.size", len("result text")) sp.set_attribute.assert_any_call("tool.status", "success") @@ -130,7 +132,7 @@ class TestAnnotateToolResult: tool_call_id="abc", additional_kwargs={"error": {"code": "x"}}, ) - _annotate_tool_result(sp, result) + assert _annotate_tool_result(sp, result) is True sp.set_attribute.assert_any_call("tool.error", True) @@ -193,3 +195,91 @@ class TestMiddlewareIntegration: assert result.content == "enabled" finally: ot.reload_for_tests() + + async def test_enabled_model_call_records_metrics( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + from app.observability import otel as ot + + duration_calls: list[dict[str, Any]] = [] + token_calls: list[dict[str, Any]] = [] + monkeypatch.setattr( + "app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_call_duration", + lambda duration_ms, **attrs: duration_calls.append( + {"duration_ms": duration_ms, **attrs} + ), + ) + monkeypatch.setattr( + "app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_token_usage", + lambda **attrs: token_calls.append(attrs), + ) + + ot.reload_for_tests() + try: + mw = OtelSpanMiddleware() + + async def handler(req): + return AIMessage( + content="enabled", + usage_metadata={ + "input_tokens": 3, + "output_tokens": 5, + "total_tokens": 8, + }, + ) + + request = MagicMock() + request.model = MagicMock() + request.model.model_name = "gpt-4o" + request.model.provider = "openai" + await mw.awrap_model_call(request, handler) + + assert duration_calls + assert token_calls == [ + { + "input_tokens": 3, + "output_tokens": 5, + "model": "gpt-4o", + "provider": "openai", + } + ] + finally: + ot.reload_for_tests() + + async def test_enabled_tool_call_records_error_metric( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False) + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + from app.observability import otel as ot + + errors: list[str] = [] + monkeypatch.setattr( + "app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_error", + lambda *, tool_name: errors.append(tool_name), + ) + monkeypatch.setattr( + "app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_duration", + lambda *args, **kwargs: None, + ) + + ot.reload_for_tests() + try: + mw = OtelSpanMiddleware() + + async def handler(req): + return ToolMessage( + content="failed", + tool_call_id="abc", + status="error", + ) + + request = MagicMock() + request.tool = MagicMock() + request.tool.name = "web_search" + await mw.awrap_tool_call(request, handler) + assert errors == ["web_search"] + finally: + ot.reload_for_tests() From 53691f9c518c1a95be9f6f2881f0845603bfb4df Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:02:54 +0530 Subject: [PATCH 09/28] feat(agents): track permission and compaction events --- .../middleware/shared/permissions/ask/request.py | 3 +++ surfsense_backend/app/agents/new_chat/middleware/compaction.py | 2 ++ surfsense_backend/app/agents/new_chat/middleware/doom_loop.py | 2 ++ surfsense_backend/app/agents/new_chat/middleware/permission.py | 3 +++ 4 files changed, 10 insertions(+) diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py index d61d38f34..a725bfee1 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/shared/permissions/ask/request.py @@ -17,6 +17,7 @@ from langchain_core.tools import BaseTool from langgraph.types import interrupt from app.agents.new_chat.permissions import Rule +from app.observability import metrics as ot_metrics from app.observability import otel as ot from .decision import normalize_permission_decision @@ -52,6 +53,8 @@ def request_permission_decision( ), ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE), ): + ot_metrics.record_permission_ask(permission=tool_name) + ot_metrics.record_interrupt(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE) decision = interrupt(payload) return normalize_permission_decision(decision) diff --git a/surfsense_backend/app/agents/new_chat/middleware/compaction.py b/surfsense_backend/app/agents/new_chat/middleware/compaction.py index 16361e16b..8173976fe 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/compaction.py +++ b/surfsense_backend/app/agents/new_chat/middleware/compaction.py @@ -34,6 +34,7 @@ from deepagents.middleware.summarization import ( ) from langchain_core.messages import SystemMessage +from app.observability import metrics as ot_metrics from app.observability import otel as ot if TYPE_CHECKING: @@ -178,6 +179,7 @@ class SurfSenseCompactionMiddleware(SummarizationMiddleware): messages_in=len(conversation_messages), extra={"compaction.cutoff_index": int(cutoff_index)}, ): + ot_metrics.record_compaction_run(reason="auto") messages_to_summarize, preserved_messages = super()._partition_messages( conversation_messages, cutoff_index ) diff --git a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py index 850ecd1d2..d50cadc0e 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py +++ b/surfsense_backend/app/agents/new_chat/middleware/doom_loop.py @@ -47,6 +47,7 @@ from langgraph.config import get_config from langgraph.runtime import Runtime from langgraph.types import interrupt +from app.observability import metrics as ot_metrics from app.observability import otel as ot logger = logging.getLogger(__name__) @@ -195,6 +196,7 @@ class DoomLoopMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Respon "interrupt.tool": (action or {}).get("tool", ""), }, ): + ot_metrics.record_interrupt(interrupt_type="permission_ask") decision = interrupt( { "type": "permission_ask", diff --git a/surfsense_backend/app/agents/new_chat/middleware/permission.py b/surfsense_backend/app/agents/new_chat/middleware/permission.py index f77b7e387..e174ab0bd 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/permission.py +++ b/surfsense_backend/app/agents/new_chat/middleware/permission.py @@ -61,6 +61,7 @@ from app.agents.new_chat.permissions import ( aggregate_action, evaluate_many, ) +from app.observability import metrics as ot_metrics from app.observability import otel as ot logger = logging.getLogger(__name__) @@ -284,6 +285,8 @@ class PermissionMiddleware(AgentMiddleware): # type: ignore[type-arg] ), ot.interrupt_span(interrupt_type="permission_ask"), ): + ot_metrics.record_permission_ask(permission=tool_name) + ot_metrics.record_interrupt(interrupt_type="permission_ask") decision = interrupt(payload) return _normalize_permission_decision(decision) From b9d76f006dff959879b190af3c50d3925466d351 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:03:31 +0530 Subject: [PATCH 10/28] feat(retriever): instrument knowledge base search --- .../app/retriever/chunks_hybrid_search.py | 41 +++++++++++++ .../app/retriever/documents_hybrid_search.py | 41 +++++++++++++ .../unit/observability/test_retriever_otel.py | 61 +++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 surfsense_backend/tests/unit/observability/test_retriever_otel.py diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py index e32c6c43d..47f7fe6b1 100644 --- a/surfsense_backend/app/retriever/chunks_hybrid_search.py +++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py @@ -1,13 +1,51 @@ import asyncio import contextlib +import functools import time from datetime import datetime +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger _MAX_FETCH_CHUNKS_PER_DOC = 20 +def _instrument_search(mode: str): + def _decorator(func): + @functools.wraps(func) + async def _wrapper( + self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs + ): + t0 = time.perf_counter() + with ot.kb_search_span( + search_space_id=search_space_id, + query_chars=len(query_text), + extra={"search.surface": "chunks", "search.mode": mode}, + ) as sp: + try: + result = await func( + self, query_text, top_k, search_space_id, *args, **kwargs + ) + except Exception: + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="chunks", + ) + raise + sp.set_attribute("result.count", len(result)) + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="chunks", + ) + return result + + return _wrapper + + return _decorator + + class ChucksHybridSearchRetriever: def __init__(self, db_session): """ @@ -18,6 +56,7 @@ class ChucksHybridSearchRetriever: """ self.db_session = db_session + @_instrument_search("vector") async def vector_search( self, query_text: str, @@ -88,6 +127,7 @@ class ChucksHybridSearchRetriever: return chunks + @_instrument_search("full_text") async def full_text_search( self, query_text: str, @@ -153,6 +193,7 @@ class ChucksHybridSearchRetriever: return chunks + @_instrument_search("hybrid") async def hybrid_search( self, query_text: str, diff --git a/surfsense_backend/app/retriever/documents_hybrid_search.py b/surfsense_backend/app/retriever/documents_hybrid_search.py index 3eabdb004..9ce86d404 100644 --- a/surfsense_backend/app/retriever/documents_hybrid_search.py +++ b/surfsense_backend/app/retriever/documents_hybrid_search.py @@ -1,12 +1,50 @@ import contextlib +import functools import time from datetime import datetime +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger _MAX_FETCH_CHUNKS_PER_DOC = 20 +def _instrument_search(mode: str): + def _decorator(func): + @functools.wraps(func) + async def _wrapper( + self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs + ): + t0 = time.perf_counter() + with ot.kb_search_span( + search_space_id=search_space_id, + query_chars=len(query_text), + extra={"search.surface": "documents", "search.mode": mode}, + ) as sp: + try: + result = await func( + self, query_text, top_k, search_space_id, *args, **kwargs + ) + except Exception: + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="documents", + ) + raise + sp.set_attribute("result.count", len(result)) + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="documents", + ) + return result + + return _wrapper + + return _decorator + + class DocumentHybridSearchRetriever: def __init__(self, db_session): """ @@ -17,6 +55,7 @@ class DocumentHybridSearchRetriever: """ self.db_session = db_session + @_instrument_search("vector") async def vector_search( self, query_text: str, @@ -81,6 +120,7 @@ class DocumentHybridSearchRetriever: return documents + @_instrument_search("full_text") async def full_text_search( self, query_text: str, @@ -145,6 +185,7 @@ class DocumentHybridSearchRetriever: return documents + @_instrument_search("hybrid") async def hybrid_search( self, query_text: str, diff --git a/surfsense_backend/tests/unit/observability/test_retriever_otel.py b/surfsense_backend/tests/unit/observability/test_retriever_otel.py new file mode 100644 index 000000000..9712a3150 --- /dev/null +++ b/surfsense_backend/tests/unit/observability/test_retriever_otel.py @@ -0,0 +1,61 @@ +"""Tests for retriever OTel wrappers.""" + +from __future__ import annotations + +from contextlib import contextmanager +from typing import Any + +import pytest + +from app.retriever.documents_hybrid_search import _instrument_search + +pytestmark = pytest.mark.unit + + +class _Span: + def __init__(self) -> None: + self.attrs: dict[str, Any] = {} + + def set_attribute(self, key: str, value: Any) -> None: + self.attrs[key] = value + + +@contextmanager +def _fake_span(**kwargs): + span = _Span() + span.attrs.update(kwargs) + yield span + + +@pytest.mark.asyncio +async def test_retriever_wrapper_records_one_span_and_metric(monkeypatch) -> None: + calls: list[dict[str, Any]] = [] + + monkeypatch.setattr( + "app.retriever.documents_hybrid_search.ot.kb_search_span", + lambda **kwargs: _fake_span(**kwargs), + ) + monkeypatch.setattr( + "app.retriever.documents_hybrid_search.ot_metrics.record_kb_search_duration", + lambda duration_ms, **attrs: calls.append( + {"duration_ms": duration_ms, **attrs} + ), + ) + + class Retriever: + @_instrument_search("hybrid") + async def search( + self, + query_text: str, + top_k: int, + search_space_id: int, + ) -> list[str]: + del query_text, top_k, search_space_id + return ["doc-1", "doc-2"] + + result = await Retriever().search("hello", 3, 42) + + assert result == ["doc-1", "doc-2"] + assert len(calls) == 1 + assert calls[0]["search_space_id"] == 42 + assert calls[0]["surface"] == "documents" From cea5605e327c4969efe133dd8223c4cf2d08184c Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:03:43 +0530 Subject: [PATCH 11/28] feat(indexing): track indexing and connector outcomes --- .../indexing_pipeline_service.py | 24 ++++++++++++++ .../app/tasks/celery_tasks/connector_tasks.py | 31 ++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 2339647ea..2aa92bd9b 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -57,6 +57,7 @@ from app.indexing_pipeline.pipeline_logger import ( log_retryable_llm_error, log_unexpected_error, ) +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger @@ -362,6 +363,16 @@ class IndexingPipelineService: ) perf = get_perf_logger() t_index = time.perf_counter() + document_type = ( + document.document_type.value + if getattr(document, "document_type", None) + else None + ) + persist_span_cm = ot.kb_persist_span( + document_type=document_type, + ) + persist_span = persist_span_cm.__enter__() + outcome_status = "failed" try: log_index_started(ctx) document.status = DocumentStatus.processing() @@ -429,11 +440,13 @@ class IndexingPipelineService: time.perf_counter() - t_index, ) log_index_success(ctx, chunk_count=len(chunks)) + outcome_status = "success" await self._enqueue_ai_sort_if_enabled(document) except RETRYABLE_LLM_ERRORS as e: log_retryable_llm_error(ctx, e) + outcome_status = "requeued" await rollback_and_persist_failure( self.session, document, llm_retryable_message(e) ) @@ -465,6 +478,17 @@ class IndexingPipelineService: with contextlib.suppress(Exception): await self.session.refresh(document) + with contextlib.suppress(Exception): + persist_span.set_attribute("indexing.status", outcome_status) + ot_metrics.record_indexing_document_duration( + time.perf_counter() - t_index, + document_type=document_type, + ) + ot_metrics.record_indexing_document_outcome( + document_type=document_type, + status=outcome_status, + ) + persist_span_cm.__exit__(None, None, None) return document async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 08d96cfa0..86296c3d9 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -1,14 +1,43 @@ """Celery tasks for connector indexing.""" import logging +import time import traceback +from collections.abc import Awaitable, Callable + +from celery import current_task from app.celery_app import celery_app -from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task +from app.observability import metrics as ot_metrics +from app.tasks.celery_tasks import ( + get_celery_session_maker, + run_async_celery_task as _run_async_celery_task, +) logger = logging.getLogger(__name__) +def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T: + """Run connector sync work and record aggregate connector metrics.""" + task_name = getattr(current_task, "name", None) or "unknown" + t0 = time.perf_counter() + status = "failed" + try: + result = _run_async_celery_task(coro_factory) + status = "success" + return result + finally: + elapsed_s = time.perf_counter() - t0 + ot_metrics.record_connector_sync_duration( + elapsed_s, + connector_type=task_name, + ) + ot_metrics.record_connector_sync_outcome( + connector_type=task_name, + status=status, + ) + + def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None: """ Handle greenlet_spawn errors with detailed logging for debugging. From 21d9b1f2186d832bed0348938cc87a08d9d430d3 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:47:10 +0530 Subject: [PATCH 12/28] fix(observability): sanitize outbound HTTP span URLs --- .../app/observability/bootstrap.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/observability/bootstrap.py b/surfsense_backend/app/observability/bootstrap.py index a1b9817da..ad1d02ea8 100644 --- a/surfsense_backend/app/observability/bootstrap.py +++ b/surfsense_backend/app/observability/bootstrap.py @@ -8,6 +8,7 @@ import os import socket from importlib import metadata from typing import Any +from urllib.parse import urlsplit, urlunsplit from app.observability import otel @@ -122,6 +123,27 @@ def _safe_instrument(name: str, instrument: Any) -> bool: return True +def _url_without_query(raw_url: Any) -> str | None: + try: + parts = urlsplit(str(raw_url)) + except Exception: + return None + if not parts.scheme or not parts.netloc: + return None + return urlunsplit((parts.scheme, parts.netloc, parts.path or "/", "", "")) + + +def _sanitize_http_span_url(span: Any, request: Any) -> None: + sanitized = _url_without_query(getattr(request, "url", None)) + if not sanitized: + return + with contextlib.suppress(Exception): + # Keep both old and current semantic-convention names safe. The + # collector can drop one later without needing application changes. + span.set_attribute("http.url", sanitized) + span.set_attribute("url.full", sanitized) + + def _instrument_fastapi(app: Any | None) -> None: global _FASTAPI_INSTRUMENTED if app is None or _FASTAPI_INSTRUMENTED: @@ -202,7 +224,12 @@ def _instrument_httpx() -> None: def _run() -> None: from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor - HTTPXClientInstrumentor().instrument() + HTTPXClientInstrumentor().instrument( + request_hook=lambda span, request: _sanitize_http_span_url(span, request), + response_hook=lambda span, request, _response: _sanitize_http_span_url( + span, request + ), + ) if _safe_instrument("HTTPX", _run): _HTTPX_INSTRUMENTED = True From f7f49de10941cc8c3f3337207f28780cb04d285f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:47:50 +0530 Subject: [PATCH 13/28] feat(observability): add chat subagent and ETL telemetry primitives --- .../app/observability/metrics.py | 180 ++++++++++++++++++ surfsense_backend/app/observability/otel.py | 154 +++++++++++++++ 2 files changed, 334 insertions(+) diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index fb99b4cc8..53beb959d 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -195,6 +195,73 @@ def _perf_elapsed(): ) +@lru_cache(maxsize=1) +def _chat_request_duration(): + return _get_meter().create_histogram( + "surfsense.chat.request.duration", + unit="ms", + description="Duration of SurfSense streamed chat requests.", + ) + + +@lru_cache(maxsize=1) +def _chat_request_outcome(): + return _get_meter().create_counter( + "surfsense.chat.request.outcome", + description="Count of SurfSense chat request outcomes.", + ) + + +@lru_cache(maxsize=1) +def _subagent_invoke_duration(): + return _get_meter().create_histogram( + "surfsense.subagent.invoke.duration", + unit="ms", + description="Duration of SurfSense subagent invocations.", + ) + + +@lru_cache(maxsize=1) +def _subagent_invoke_outcome(): + return _get_meter().create_counter( + "surfsense.subagent.invoke.outcome", + description="Count of SurfSense subagent invocation outcomes.", + ) + + +@lru_cache(maxsize=1) +def _etl_extract_duration(): + return _get_meter().create_histogram( + "surfsense.etl.extract.duration", + unit="s", + description="Duration of SurfSense ETL extraction.", + ) + + +@lru_cache(maxsize=1) +def _etl_extract_outcome(): + return _get_meter().create_counter( + "surfsense.etl.extract.outcome", + description="Count of SurfSense ETL extraction outcomes.", + ) + + +@lru_cache(maxsize=1) +def _celery_heartbeat_refreshes(): + return _get_meter().create_counter( + "surfsense.celery.heartbeat.refreshes", + description="Count of SurfSense Celery heartbeat refreshes.", + ) + + +@lru_cache(maxsize=1) +def _celery_heartbeat_failures(): + return _get_meter().create_counter( + "surfsense.celery.heartbeat.failures", + description="Count of SurfSense Celery heartbeat failures.", + ) + + def record_model_call_duration( duration_ms: float, *, model: str | None, provider: str | None ) -> None: @@ -312,6 +379,111 @@ def record_perf_elapsed(duration_ms: float, *, label: str) -> None: _record(_perf_elapsed(), duration_ms, {"label": label}) +def record_chat_request_duration( + duration_ms: float, + *, + flow: str, + outcome: str, + agent_mode: str | None = None, +) -> None: + _record( + _chat_request_duration(), + duration_ms, + {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode}, + ) + + +def record_chat_request_outcome( + *, + flow: str, + outcome: str, + agent_mode: str | None = None, +) -> None: + _add( + _chat_request_outcome(), + 1, + {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode}, + ) + + +def record_subagent_invoke_duration( + duration_ms: float, + *, + subagent_type: str, + path: str | None, + outcome: str, +) -> None: + _record( + _subagent_invoke_duration(), + duration_ms, + { + "subagent.type": subagent_type, + "subagent.path": path or "unknown", + "outcome": outcome, + }, + ) + + +def record_subagent_invoke_outcome( + *, + subagent_type: str, + path: str | None, + outcome: str, +) -> None: + _add( + _subagent_invoke_outcome(), + 1, + { + "subagent.type": subagent_type, + "subagent.path": path or "unknown", + "outcome": outcome, + }, + ) + + +def record_etl_extract_duration( + duration_s: float, + *, + etl_service: str | None, + content_type: str | None, + status: str, +) -> None: + _record( + _etl_extract_duration(), + duration_s, + { + "etl.service": etl_service or "unknown", + "content.type": content_type or "unknown", + "status": status, + }, + ) + + +def record_etl_extract_outcome( + *, + etl_service: str | None, + content_type: str | None, + status: str, +) -> None: + _add( + _etl_extract_outcome(), + 1, + { + "etl.service": etl_service or "unknown", + "content.type": content_type or "unknown", + "status": status, + }, + ) + + +def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None: + _add(_celery_heartbeat_refreshes(), 1, {"heartbeat.type": heartbeat_type}) + + +def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None: + _add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type}) + + def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]: from opentelemetry.metrics import Observation @@ -398,9 +570,15 @@ def register_runtime_observables() -> None: __all__ = [ "record_auth_failure", + "record_celery_heartbeat_failure", + "record_celery_heartbeat_refresh", + "record_chat_request_duration", + "record_chat_request_outcome", "record_compaction_run", "record_connector_sync_duration", "record_connector_sync_outcome", + "record_etl_extract_duration", + "record_etl_extract_outcome", "record_indexing_document_duration", "record_indexing_document_outcome", "record_interrupt", @@ -410,6 +588,8 @@ __all__ = [ "record_perf_elapsed", "record_permission_ask", "record_rate_limit_rejection", + "record_subagent_invoke_duration", + "record_subagent_invoke_outcome", "record_tool_call_duration", "record_tool_call_error", "register_runtime_observables", diff --git a/surfsense_backend/app/observability/otel.py b/surfsense_backend/app/observability/otel.py index f39cfe535..e4d4a1fd9 100644 --- a/surfsense_backend/app/observability/otel.py +++ b/surfsense_backend/app/observability/otel.py @@ -244,6 +244,152 @@ def kb_persist_span( return span("kb.persist", attributes=attrs) +def chat_request_span( + *, + chat_id: int | None = None, + search_space_id: int | None = None, + flow: str | None = None, + request_id: str | None = None, + turn_id: str | None = None, + filesystem_mode: str | None = None, + client_platform: str | None = None, + agent_mode: str | None = None, + extra: dict[str, Any] | None = None, +): + """Parent span for a single streamed chat or resume turn.""" + attrs: dict[str, Any] = {} + if chat_id is not None: + attrs["chat.id"] = int(chat_id) + if search_space_id is not None: + attrs["search_space.id"] = int(search_space_id) + if flow: + attrs["chat.flow"] = flow + if request_id: + attrs["request.id"] = request_id + if turn_id: + attrs["turn.id"] = turn_id + if filesystem_mode: + attrs["filesystem.mode"] = filesystem_mode + if client_platform: + attrs["client.platform"] = client_platform + if agent_mode: + attrs["agent.mode"] = agent_mode + if extra: + attrs.update(extra) + return span("chat.request", attributes=attrs) + + +def subagent_invoke_span( + *, + subagent_type: str, + path: str | None = None, + extra: dict[str, Any] | None = None, +): + """Span around invoking a delegated subagent from the main agent.""" + attrs: dict[str, Any] = {"subagent.type": subagent_type} + if path: + attrs["subagent.path"] = path + if extra: + attrs.update(extra) + return span("subagent.invoke", attributes=attrs) + + +def connector_sync_span( + *, + connector_type: str | None, + extra: dict[str, Any] | None = None, +): + """Business-level span around connector indexing task execution.""" + attrs: dict[str, Any] = {"connector.type": connector_type or "unknown"} + if extra: + attrs.update(extra) + return span("connector.sync", attributes=attrs) + + +def etl_extract_span( + *, + content_type: str | None = None, + file_extension: str | None = None, + processing_mode: str | None = None, + extra: dict[str, Any] | None = None, +): + """Span around top-level ETL extraction for a file.""" + attrs: dict[str, Any] = {} + if content_type: + attrs["content.type"] = content_type + if file_extension: + attrs["file.extension"] = file_extension + if processing_mode: + attrs["processing.mode"] = processing_mode + if extra: + attrs.update(extra) + return span("etl.extract", attributes=attrs) + + +def etl_parse_span( + *, + etl_service: str | None, + content_type: str | None = None, + file_extension: str | None = None, + processing_mode: str | None = None, + extra: dict[str, Any] | None = None, +): + """Span around a concrete ETL parser/backend call.""" + attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"} + if content_type: + attrs["content.type"] = content_type + if file_extension: + attrs["file.extension"] = file_extension + if processing_mode: + attrs["processing.mode"] = processing_mode + if extra: + attrs.update(extra) + return span("etl.parse", attributes=attrs) + + +def etl_ocr_span( + *, + etl_service: str | None, + file_extension: str | None = None, + extra: dict[str, Any] | None = None, +): + """Span around OCR extraction from image content.""" + attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"} + if file_extension: + attrs["file.extension"] = file_extension + if extra: + attrs.update(extra) + return span("etl.ocr", attributes=attrs) + + +def etl_picture_describe_span( + *, + image_count: int | None = None, + extra: dict[str, Any] | None = None, +): + """Span around describing embedded images in a document.""" + attrs: dict[str, Any] = {} + if image_count is not None: + attrs["image.count"] = int(image_count) + if extra: + attrs.update(extra) + return span("etl.picture.describe", attributes=attrs) + + +def etl_picture_ocr_span( + *, + file_extension: str | None = None, + extra: dict[str, Any] | None = None, +): + """Span around per-image OCR during picture description.""" + attrs: dict[str, Any] = {} + if file_extension: + attrs["file.extension"] = file_extension + if extra: + attrs.update(extra) + return span("etl.picture.ocr", attributes=attrs) + + def compaction_span( *, reason: str | None = None, @@ -306,7 +452,14 @@ def reload_for_tests() -> bool: __all__ = [ + "chat_request_span", "compaction_span", + "connector_sync_span", + "etl_extract_span", + "etl_ocr_span", + "etl_parse_span", + "etl_picture_describe_span", + "etl_picture_ocr_span", "interrupt_span", "is_enabled", "kb_persist_span", @@ -315,5 +468,6 @@ __all__ = [ "permission_asked_span", "reload_for_tests", "span", + "subagent_invoke_span", "tool_call_span", ] From 5a6b92c2b64a48ecb4a51ab32dd674bb4ac219ee Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:48:19 +0530 Subject: [PATCH 14/28] feat(chat): instrument streamed chat request telemetry --- .../app/tasks/chat/stream_new_chat.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index c9faa1691..b961733b4 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -58,6 +58,7 @@ from app.db import ( async_session_maker, shielded_async_session, ) +from app.observability import metrics as ot_metrics, otel as ot from app.prompts import TITLE_GENERATION_PROMPT from app.services.auto_model_pin_service import ( mark_runtime_cooldown, @@ -883,6 +884,19 @@ async def stream_new_chat( stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}" stream_result.filesystem_mode = fs_mode stream_result.client_platform = fs_platform + chat_agent_mode = "unknown" + chat_outcome = "success" + chat_span_cm = ot.chat_request_span( + chat_id=chat_id, + search_space_id=search_space_id, + flow=flow, + request_id=request_id, + turn_id=stream_result.turn_id, + filesystem_mode=fs_mode, + client_platform=fs_platform, + agent_mode=chat_agent_mode, + ) + chat_span = chat_span_cm.__enter__() _log_file_contract("turn_start", stream_result) _perf_log.info( "[stream_new_chat] filesystem_mode=%s client_platform=%s", @@ -1189,6 +1203,9 @@ async def stream_new_chat( from app.config import config as _app_config use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED) + chat_agent_mode = "multi" if use_multi_agent else "single" + with contextlib.suppress(Exception): + chat_span.set_attribute("agent.mode", chat_agent_mode) _t0 = time.perf_counter() agent_factory = ( @@ -2011,6 +2028,10 @@ async def stream_new_chat( user_message, error_extra, ) = _classify_stream_exception(e, flow_label="chat") + chat_outcome = error_code or error_kind or "error" + with contextlib.suppress(Exception): + chat_span.set_attribute("chat.outcome", chat_outcome) + chat_span.record_exception(e) error_message = f"Error during chat: {e!s}" print(f"[stream_new_chat] {error_message}") print(f"[stream_new_chat] Exception type: {type(e).__name__}") @@ -2201,6 +2222,20 @@ async def stream_new_chat( ) trim_native_heap() log_system_snapshot("stream_new_chat_END") + with contextlib.suppress(Exception): + chat_span.set_attribute("chat.outcome", chat_outcome) + ot_metrics.record_chat_request_duration( + (time.perf_counter() - _t_total) * 1000, + flow=flow, + outcome=chat_outcome, + agent_mode=chat_agent_mode, + ) + ot_metrics.record_chat_request_outcome( + flow=flow, + outcome=chat_outcome, + agent_mode=chat_agent_mode, + ) + chat_span_cm.__exit__(None, None, None) async def stream_resume_chat( @@ -2225,6 +2260,19 @@ async def stream_resume_chat( stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}" stream_result.filesystem_mode = fs_mode stream_result.client_platform = fs_platform + chat_agent_mode = "unknown" + chat_outcome = "success" + chat_span_cm = ot.chat_request_span( + chat_id=chat_id, + search_space_id=search_space_id, + flow="resume", + request_id=request_id, + turn_id=stream_result.turn_id, + filesystem_mode=fs_mode, + client_platform=fs_platform, + agent_mode=chat_agent_mode, + ) + chat_span = chat_span_cm.__enter__() _log_file_contract("turn_start", stream_result) _perf_log.info( "[stream_resume] filesystem_mode=%s client_platform=%s", @@ -2454,6 +2502,11 @@ async def stream_resume_chat( visibility = thread_visibility or ChatVisibility.PRIVATE from app.config import config as _app_config + chat_agent_mode = ( + "multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single" + ) + with contextlib.suppress(Exception): + chat_span.set_attribute("agent.mode", chat_agent_mode) _t0 = time.perf_counter() agent_factory = ( create_multi_agent_chat_deep_agent @@ -2815,6 +2868,10 @@ async def stream_resume_chat( user_message, error_extra, ) = _classify_stream_exception(e, flow_label="resume") + chat_outcome = error_code or error_kind or "error" + with contextlib.suppress(Exception): + chat_span.set_attribute("chat.outcome", chat_outcome) + chat_span.record_exception(e) error_message = f"Error during resume: {e!s}" print(f"[stream_resume_chat] {error_message}") print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}") @@ -2964,3 +3021,17 @@ async def stream_resume_chat( ) trim_native_heap() log_system_snapshot("stream_resume_chat_END") + with contextlib.suppress(Exception): + chat_span.set_attribute("chat.outcome", chat_outcome) + ot_metrics.record_chat_request_duration( + (time.perf_counter() - _t_total) * 1000, + flow="resume", + outcome=chat_outcome, + agent_mode=chat_agent_mode, + ) + ot_metrics.record_chat_request_outcome( + flow="resume", + outcome=chat_outcome, + agent_mode=chat_agent_mode, + ) + chat_span_cm.__exit__(None, None, None) From 8bca29fe0d165f763da70bc82d925931f8cf22e8 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:48:57 +0530 Subject: [PATCH 15/28] feat(agents): track subagent invocation telemetry --- .../task_tool.py | 243 ++++++++++++++---- 1 file changed, 199 insertions(+), 44 deletions(-) diff --git a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py index f6a9ff146..c3babab83 100644 --- a/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py +++ b/surfsense_backend/app/agents/multi_agent_chat/middleware/main_agent/checkpointed_subagent_middleware/task_tool.py @@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool from langgraph.errors import GraphInterrupt from langgraph.types import Command, Interrupt +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger from .config import ( @@ -173,6 +174,9 @@ def build_task_tool_with_parent_config( exc_info=True, ) + invoke_path = "resume" if pending_value is not None else "fresh" + invoke_start = time.perf_counter() + invoke_outcome = "ok" if pending_value is not None: resume_value = consume_surfsense_resume(runtime) if resume_value is None: @@ -188,18 +192,94 @@ def build_task_tool_with_parent_config( # Prevent the parent's resume payload from leaking into subagent # interrupts via langgraph's parent_scratchpad fallback. drain_parent_null_resume(runtime) - try: - result = subagent.invoke( - build_resume_command(resume_value, pending_id), - config=sub_config, - ) - except GraphInterrupt as gi: - _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + with ot.subagent_invoke_span( + subagent_type=subagent_type, path=invoke_path + ) as sp: + try: + result = subagent.invoke( + build_resume_command(resume_value, pending_id), + config=sub_config, + ) + sp.set_attribute("subagent.outcome", invoke_outcome) + except GraphInterrupt as gi: + invoke_outcome = "interrupted" + sp.set_attribute("subagent.outcome", invoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - invoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + except Exception: + invoke_outcome = "error" + sp.set_attribute("subagent.outcome", invoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - invoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + raise else: - try: - result = subagent.invoke(subagent_state, config=sub_config) - except GraphInterrupt as gi: - _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + with ot.subagent_invoke_span( + subagent_type=subagent_type, path=invoke_path + ) as sp: + try: + result = subagent.invoke(subagent_state, config=sub_config) + sp.set_attribute("subagent.outcome", invoke_outcome) + except GraphInterrupt as gi: + invoke_outcome = "interrupted" + sp.set_attribute("subagent.outcome", invoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - invoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + except Exception: + invoke_outcome = "error" + sp.set_attribute("subagent.outcome", invoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - invoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + raise + invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000 + ot_metrics.record_subagent_invoke_duration( + invoke_elapsed_ms, + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=invoke_outcome, + ) return _return_command_with_state_update(result, runtime.tool_call_id) async def atask( @@ -274,40 +354,104 @@ def build_task_tool_with_parent_config( # Prevent the parent's resume payload from leaking into subagent # interrupts via langgraph's parent_scratchpad fallback. drain_parent_null_resume(runtime) - try: - result = await subagent.ainvoke( - build_resume_command(resume_value, pending_id), - config=sub_config, - ) - except GraphInterrupt as gi: - ainvoke_outcome = "interrupted" - _perf_log.info( - "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " - "aget_state=%.3fs ainvoke=%.3fs total=%.3fs", - subagent_type, - invoke_path, - ainvoke_outcome, - aget_state_elapsed, - time.perf_counter() - ainvoke_start, - time.perf_counter() - atask_start, - ) - _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + with ot.subagent_invoke_span( + subagent_type=subagent_type, path=invoke_path + ) as sp: + try: + result = await subagent.ainvoke( + build_resume_command(resume_value, pending_id), + config=sub_config, + ) + sp.set_attribute("subagent.outcome", ainvoke_outcome) + except GraphInterrupt as gi: + ainvoke_outcome = "interrupted" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + _perf_log.info( + "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " + "aget_state=%.3fs ainvoke=%.3fs total=%.3fs", + subagent_type, + invoke_path, + ainvoke_outcome, + aget_state_elapsed, + time.perf_counter() - ainvoke_start, + time.perf_counter() - atask_start, + ) + _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + except Exception: + ainvoke_outcome = "error" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + raise else: - try: - result = await subagent.ainvoke(subagent_state, config=sub_config) - except GraphInterrupt as gi: - ainvoke_outcome = "interrupted" - _perf_log.info( - "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " - "aget_state=%.3fs ainvoke=%.3fs total=%.3fs", - subagent_type, - invoke_path, - ainvoke_outcome, - aget_state_elapsed, - time.perf_counter() - ainvoke_start, - time.perf_counter() - atask_start, - ) - _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + with ot.subagent_invoke_span( + subagent_type=subagent_type, path=invoke_path + ) as sp: + try: + result = await subagent.ainvoke( + subagent_state, config=sub_config + ) + sp.set_attribute("subagent.outcome", ainvoke_outcome) + except GraphInterrupt as gi: + ainvoke_outcome = "interrupted" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + _perf_log.info( + "[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " + "aget_state=%.3fs ainvoke=%.3fs total=%.3fs", + subagent_type, + invoke_path, + ainvoke_outcome, + aget_state_elapsed, + time.perf_counter() - ainvoke_start, + time.perf_counter() - atask_start, + ) + _reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) + except Exception: + ainvoke_outcome = "error" + sp.set_attribute("subagent.outcome", ainvoke_outcome) + ot_metrics.record_subagent_invoke_duration( + (time.perf_counter() - ainvoke_start) * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + raise ainvoke_elapsed = time.perf_counter() - ainvoke_start except GraphInterrupt: raise @@ -326,6 +470,17 @@ def build_task_tool_with_parent_config( merge_elapsed, time.perf_counter() - atask_start, ) + ot_metrics.record_subagent_invoke_duration( + ainvoke_elapsed * 1000, + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) + ot_metrics.record_subagent_invoke_outcome( + subagent_type=subagent_type, + path=invoke_path, + outcome=ainvoke_outcome, + ) return cmd return StructuredTool.from_function( From 4e3a6dff465f979c3705a53942695ad2f899c7ee Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:49:42 +0530 Subject: [PATCH 16/28] feat(etl): instrument extraction spans and outcomes --- .../app/etl_pipeline/etl_pipeline_service.py | 176 ++++++++++++------ 1 file changed, 124 insertions(+), 52 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 87e8138fd..4cdd387b0 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -1,4 +1,7 @@ +import contextlib import logging +import time +from pathlib import PurePosixPath from app.config import config as app_config from app.etl_pipeline.etl_document import EtlRequest, EtlResult @@ -10,6 +13,11 @@ from app.etl_pipeline.file_classifier import FileCategory, classify_file from app.etl_pipeline.parsers.audio import transcribe_audio from app.etl_pipeline.parsers.direct_convert import convert_file_directly from app.etl_pipeline.parsers.plaintext import read_plaintext +from app.observability import metrics as ot_metrics, otel as ot + + +def _file_extension(filename: str) -> str: + return PurePosixPath(filename).suffix.lower() or "none" class EtlPipelineService: @@ -20,49 +28,88 @@ class EtlPipelineService: async def extract(self, request: EtlRequest) -> EtlResult: category = classify_file(request.filename) + start = time.perf_counter() + status = "success" + result: EtlResult | None = None + with ot.etl_extract_span( + content_type=category.value, + file_extension=_file_extension(request.filename), + processing_mode=request.processing_mode.value, + ) as sp: + try: + if category == FileCategory.UNSUPPORTED: + raise EtlUnsupportedFileError( + f"File type not supported for parsing: {request.filename}" + ) - if category == FileCategory.UNSUPPORTED: - raise EtlUnsupportedFileError( - f"File type not supported for parsing: {request.filename}" - ) + if category == FileCategory.PLAINTEXT: + content = read_plaintext(request.file_path) + result = EtlResult( + markdown_content=content, + etl_service="PLAINTEXT", + content_type="plaintext", + ) + return result - if category == FileCategory.PLAINTEXT: - content = read_plaintext(request.file_path) - return EtlResult( - markdown_content=content, - etl_service="PLAINTEXT", - content_type="plaintext", - ) + if category == FileCategory.DIRECT_CONVERT: + content = convert_file_directly(request.file_path, request.filename) + result = EtlResult( + markdown_content=content, + etl_service="DIRECT_CONVERT", + content_type="direct_convert", + ) + return result - if category == FileCategory.DIRECT_CONVERT: - content = convert_file_directly(request.file_path, request.filename) - return EtlResult( - markdown_content=content, - etl_service="DIRECT_CONVERT", - content_type="direct_convert", - ) + if category == FileCategory.AUDIO: + content = await transcribe_audio(request.file_path, request.filename) + result = EtlResult( + markdown_content=content, + etl_service="AUDIO", + content_type="audio", + ) + return result - if category == FileCategory.AUDIO: - content = await transcribe_audio(request.file_path, request.filename) - return EtlResult( - markdown_content=content, - etl_service="AUDIO", - content_type="audio", - ) + if category == FileCategory.IMAGE: + result = await self._extract_image(request) + return result - if category == FileCategory.IMAGE: - return await self._extract_image(request) - - return await self._extract_document(request) + result = await self._extract_document(request) + return result + except Exception: + status = "error" + raise + finally: + with contextlib.suppress(Exception): + if result is not None: + sp.set_attribute("etl.service", result.etl_service) + sp.set_attribute("content.type", result.content_type) + sp.set_attribute("etl.status", status) + ot_metrics.record_etl_extract_duration( + time.perf_counter() - start, + etl_service=result.etl_service if result else None, + content_type=result.content_type if result else category.value, + status=status, + ) + ot_metrics.record_etl_extract_outcome( + etl_service=result.etl_service if result else None, + content_type=result.content_type if result else category.value, + status=status, + ) async def _extract_image(self, request: EtlRequest) -> EtlResult: if self._vision_llm: try: from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm - content = await parse_with_vision_llm( - request.file_path, request.filename, self._vision_llm - ) + with ot.etl_parse_span( + etl_service="VISION_LLM", + content_type="image", + file_extension=_file_extension(request.filename), + ) as sp: + content = await parse_with_vision_llm( + request.file_path, request.filename, self._vision_llm + ) + sp.set_attribute("etl.status", "success") return EtlResult( markdown_content=content, etl_service="VISION_LLM", @@ -94,7 +141,11 @@ class EtlPipelineService: ) try: - return await self._extract_document(request) + with ot.etl_ocr_span( + etl_service=app_config.ETL_SERVICE, + file_extension=_file_extension(request.filename), + ): + return await self._extract_document(request) except (EtlUnsupportedFileError, EtlServiceUnavailableError): raise EtlUnsupportedFileError( f"Cannot process image {request.filename}: vision LLM " @@ -121,18 +172,27 @@ class EtlPipelineService: f"File type {ext} is not supported by {etl_service}" ) - if etl_service == "DOCLING": - from app.etl_pipeline.parsers.docling import parse_with_docling + with ot.etl_parse_span( + etl_service=etl_service, + content_type="document", + file_extension=ext, + processing_mode=request.processing_mode.value, + ) as sp: + if etl_service == "DOCLING": + from app.etl_pipeline.parsers.docling import parse_with_docling - content = await parse_with_docling(request.file_path, request.filename) - elif etl_service == "UNSTRUCTURED": - from app.etl_pipeline.parsers.unstructured import parse_with_unstructured + content = await parse_with_docling(request.file_path, request.filename) + elif etl_service == "UNSTRUCTURED": + from app.etl_pipeline.parsers.unstructured import ( + parse_with_unstructured, + ) - content = await parse_with_unstructured(request.file_path) - elif etl_service == "LLAMACLOUD": - content = await self._extract_with_llamacloud(request) - else: - raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") + content = await parse_with_unstructured(request.file_path) + elif etl_service == "LLAMACLOUD": + content = await self._extract_with_llamacloud(request) + else: + raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") + sp.set_attribute("etl.status", "success") # When the operator opts into vision-LLM at ingest, walk the # original file's embedded images and append a structured @@ -171,9 +231,14 @@ class EtlPipelineService: async def _ocr_image(image_path: str, image_name: str) -> str: try: sub = EtlPipelineService(vision_llm=None) - ocr_result = await sub.extract( - EtlRequest(file_path=image_path, filename=image_name) - ) + with ot.etl_picture_ocr_span( + file_extension=_file_extension(image_name) + ) as sp: + ocr_result = await sub.extract( + EtlRequest(file_path=image_path, filename=image_name) + ) + sp.set_attribute("etl.service", ocr_result.etl_service) + sp.set_attribute("etl.status", "success") except ( EtlUnsupportedFileError, EtlServiceUnavailableError, @@ -186,12 +251,19 @@ class EtlPipelineService: return ocr_result.markdown_content try: - result = await describe_pictures( - request.file_path, - request.filename, - self._vision_llm, - ocr_runner=_ocr_image, - ) + with ot.etl_picture_describe_span() as sp: + result = await describe_pictures( + request.file_path, + request.filename, + self._vision_llm, + ocr_runner=_ocr_image, + ) + sp.set_attribute("image.described.count", len(result.descriptions)) + sp.set_attribute("image.failed.count", result.failed) + sp.set_attribute("image.skipped.too_small", result.skipped_too_small) + sp.set_attribute("image.skipped.too_large", result.skipped_too_large) + sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate) + sp.set_attribute("etl.status", "success") except Exception: # Picture description is additive; never let it fail an # otherwise-successful document extraction. From 7c07c220fc3dc7b0935cf080b5dbd9a2310a561f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:49:59 +0530 Subject: [PATCH 17/28] feat(connectors): add connector sync spans --- .../app/tasks/celery_tasks/connector_tasks.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 86296c3d9..e0f0f09c9 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -8,7 +8,7 @@ from collections.abc import Awaitable, Callable from celery import current_task from app.celery_app import celery_app -from app.observability import metrics as ot_metrics +from app.observability import metrics as ot_metrics, otel as ot from app.tasks.celery_tasks import ( get_celery_session_maker, run_async_celery_task as _run_async_celery_task, @@ -23,9 +23,14 @@ def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T: t0 = time.perf_counter() status = "failed" try: - result = _run_async_celery_task(coro_factory) + with ot.connector_sync_span(connector_type=task_name) as sp: + result = _run_async_celery_task(coro_factory) + sp.set_attribute("connector.status", "success") status = "success" return result + except Exception: + status = "failed" + raise finally: elapsed_s = time.perf_counter() - t0 ot_metrics.record_connector_sync_duration( From 87a4dcfd054d8f0d83d04bde156e047aceba6e46 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 13:50:32 +0530 Subject: [PATCH 18/28] feat(tasks): record indexing heartbeat metrics --- .../app/routes/search_source_connectors_routes.py | 15 +++++++++++++++ .../app/tasks/celery_tasks/document_tasks.py | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 1338fe16b..4d7e6b2ef 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -43,6 +43,7 @@ from app.db import ( async_session_maker, get_async_session, ) +from app.observability import metrics as ot_metrics from app.schemas import ( GoogleDriveIndexRequest, MCPConnectorCreate, @@ -104,7 +105,9 @@ async def _run_indexing_heartbeat_loop(notification_id: int) -> None: await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL) try: get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive") + ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="connector") except Exception as e: + ot_metrics.record_celery_heartbeat_failure(heartbeat_type="connector") logger.warning( f"Failed to refresh Redis heartbeat for notification " f"{notification_id}: {e}" @@ -1338,7 +1341,13 @@ async def _run_indexing_with_notifications( get_heartbeat_redis_client().setex( heartbeat_key, HEARTBEAT_TTL_SECONDS, "0" ) + ot_metrics.record_celery_heartbeat_refresh( + heartbeat_type="connector" + ) except Exception as e: + ot_metrics.record_celery_heartbeat_failure( + heartbeat_type="connector" + ) logger.warning(f"Failed to set initial Redis heartbeat: {e}") # Start a background coroutine that refreshes the @@ -1397,8 +1406,14 @@ async def _run_indexing_with_notifications( get_heartbeat_redis_client().setex( heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count) ) + ot_metrics.record_celery_heartbeat_refresh( + heartbeat_type="connector" + ) except Exception as e: # Don't let Redis errors break the indexing + ot_metrics.record_celery_heartbeat_failure( + heartbeat_type="connector" + ) logger.warning(f"Failed to set Redis heartbeat: {e}") try: diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index c78e376bd..1f9609968 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -9,6 +9,7 @@ from uuid import UUID from app.celery_app import celery_app from app.config import config +from app.observability import metrics as ot_metrics from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task @@ -59,7 +60,9 @@ def _start_heartbeat(notification_id: int) -> None: try: key = _get_heartbeat_key(notification_id) _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started") + ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document") except Exception as e: + ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document") logger.warning( f"Failed to set initial heartbeat for notification {notification_id}: {e}" ) @@ -87,7 +90,9 @@ async def _run_heartbeat_loop(notification_id: int): await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL) try: _get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive") + ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document") except Exception as e: + ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document") logger.warning( f"Failed to refresh heartbeat for notification {notification_id}: {e}" ) From 0fe2bba5a8f41459ada4e3648b811d20e8652a13 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:46:35 +0530 Subject: [PATCH 19/28] refactor(chat): replace TooltipIconButton with Button for scroll functionality --- .../components/assistant-ui/chat-viewport.tsx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/surfsense_web/components/assistant-ui/chat-viewport.tsx b/surfsense_web/components/assistant-ui/chat-viewport.tsx index cb0b484ef..cb0fd2005 100644 --- a/surfsense_web/components/assistant-ui/chat-viewport.tsx +++ b/surfsense_web/components/assistant-ui/chat-viewport.tsx @@ -3,16 +3,19 @@ import { ThreadPrimitive } from "@assistant-ui/react"; import { ArrowDownIcon } from "lucide-react"; import type { FC, ReactNode } from "react"; -import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button"; +import { Button } from "@/components/ui/button"; const ChatScrollToBottom: FC = () => ( - - + ); From dbb652d4f87f502e3224cc40ee50e2d80ea4ec2b Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:48:01 +0530 Subject: [PATCH 20/28] feat(observability): add telemetry error and event helpers --- .../app/observability/metrics.py | 104 ++++++++++++++++-- surfsense_backend/app/observability/otel.py | 44 ++++++++ .../tests/unit/observability/test_helpers.py | 101 +++++++++++++++++ 3 files changed, 241 insertions(+), 8 deletions(-) create mode 100644 surfsense_backend/tests/unit/observability/test_helpers.py diff --git a/surfsense_backend/app/observability/metrics.py b/surfsense_backend/app/observability/metrics.py index 53beb959d..798a6e2f7 100644 --- a/surfsense_backend/app/observability/metrics.py +++ b/surfsense_backend/app/observability/metrics.py @@ -20,6 +20,19 @@ logger = logging.getLogger(__name__) _INSTRUMENTATION_NAME = "surfsense.platform" _OBSERVABLES_REGISTERED = False +_ERROR_CATEGORY_UNKNOWN = "unknown" + +_ERROR_CATEGORY_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = ( + ("rate_limited", ("ratelimit", "rate_limit", "toomanyrequests", "429")), + ("auth_failed", ("authentication", "auth", "unauthorized", "forbidden")), + ("quota_exhausted", ("quota", "insufficient", "credit", "billing")), + ("timeout", ("timeout", "timedout", "deadline")), + ("network_failed", ("connection", "connect", "network", "dns", "socket")), + ("server_error", ("internalserver", "serviceunavailable", "badgateway", "gateway")), + ("lock_contention", ("lock", "busy", "contention", "alreadyrunning")), + ("unsupported_format", ("unsupported", "format", "filetype")), + ("provider_error", ("provider", "apierror", "apistatus", "badrequest")), +) def _package_version() -> str: @@ -47,6 +60,36 @@ def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]: return cleaned +def _attrs_with_optional_error_category( + attrs: dict[str, Any], error_category: str | None +) -> dict[str, Any]: + if error_category: + return {**attrs, "error.category": error_category} + return attrs + + +def categorize_exception(exc: BaseException | None) -> str: + """Return a low-cardinality category for an exception.""" + if exc is None: + return _ERROR_CATEGORY_UNKNOWN + haystack = " ".join( + cls.__name__.replace("-", "").replace("_", "").lower() + for cls in type(exc).__mro__ + ) + for category, hints in _ERROR_CATEGORY_HINTS: + if any(hint in haystack for hint in hints): + return category + return _ERROR_CATEGORY_UNKNOWN + + +def parse_celery_task_label(task_name: str | None) -> str: + """Return the operation token from a Celery task name.""" + if not task_name: + return "unknown" + operation = str(task_name).split("_", 1)[0].strip() + return operation or "unknown" + + def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None: if not _is_enabled(): return @@ -262,6 +305,15 @@ def _celery_heartbeat_failures(): ) +@lru_cache(maxsize=1) +def _celery_queue_latency(): + return _get_meter().create_histogram( + "surfsense.celery.queue.latency", + unit="s", + description="Time SurfSense Celery tasks spend waiting in queue.", + ) + + def record_model_call_duration( duration_ms: float, *, model: str | None, provider: str | None ) -> None: @@ -359,11 +411,16 @@ def record_connector_sync_duration( ) -def record_connector_sync_outcome(*, connector_type: str | None, status: str) -> None: +def record_connector_sync_outcome( + *, connector_type: str | None, status: str, error_category: str | None = None +) -> None: _add( _connector_sync_outcome(), 1, - {"connector.type": connector_type or "unknown", "status": status}, + _attrs_with_optional_error_category( + {"connector.type": connector_type or "unknown", "status": status}, + error_category, + ), ) @@ -398,11 +455,15 @@ def record_chat_request_outcome( flow: str, outcome: str, agent_mode: str | None = None, + error_category: str | None = None, ) -> None: _add( _chat_request_outcome(), 1, - {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode}, + _attrs_with_optional_error_category( + {"chat.flow": flow, "outcome": outcome, "agent.mode": agent_mode}, + error_category, + ), ) @@ -464,15 +525,19 @@ def record_etl_extract_outcome( etl_service: str | None, content_type: str | None, status: str, + error_category: str | None = None, ) -> None: _add( _etl_extract_outcome(), 1, - { - "etl.service": etl_service or "unknown", - "content.type": content_type or "unknown", - "status": status, - }, + _attrs_with_optional_error_category( + { + "etl.service": etl_service or "unknown", + "content.type": content_type or "unknown", + "status": status, + }, + error_category, + ), ) @@ -484,6 +549,26 @@ def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None: _add(_celery_heartbeat_failures(), 1, {"heartbeat.type": heartbeat_type}) +def record_celery_queue_latency( + duration_s: float, + *, + task_name: str | None, + queue: str | None, + scheduled: bool, + operation: str | None, +) -> None: + _record( + _celery_queue_latency(), + duration_s, + { + "task.name": task_name or "unknown", + "task.queue": queue or "unknown", + "task.scheduled": bool(scheduled), + "operation": operation or "unknown", + }, + ) + + def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]: from opentelemetry.metrics import Observation @@ -569,9 +654,12 @@ def register_runtime_observables() -> None: __all__ = [ + "categorize_exception", + "parse_celery_task_label", "record_auth_failure", "record_celery_heartbeat_failure", "record_celery_heartbeat_refresh", + "record_celery_queue_latency", "record_chat_request_duration", "record_chat_request_outcome", "record_compaction_run", diff --git a/surfsense_backend/app/observability/otel.py b/surfsense_backend/app/observability/otel.py index e4d4a1fd9..ad2178f39 100644 --- a/surfsense_backend/app/observability/otel.py +++ b/surfsense_backend/app/observability/otel.py @@ -92,6 +92,48 @@ def is_enabled() -> bool: return _ENABLED +def _clean_event_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]: + """Coerce event attributes to OTel-safe scalar values.""" + cleaned: dict[str, str | int | float | bool] = {} + for key, value in attrs.items(): + if value is None: + continue + if isinstance(value, bool | int | float): + cleaned[key] = value + continue + text = str(value) + if text: + cleaned[key] = text + return cleaned + + +def add_event(name: str, attributes: dict[str, Any] | None = None) -> None: + """Attach an event to the current active span. + + This is intentionally no-op and exception-safe when OTel is disabled, + unavailable, or no span is currently recording. + """ + if not _ENABLED or _ot_trace is None: + return + with contextlib.suppress(Exception): + sp = _ot_trace.get_current_span() + if sp is None or not sp.is_recording(): + return + sp.add_event( + name, + attributes=_clean_event_attrs(attributes) if attributes else None, + ) + + +def record_error(span_obj: Any, exc: BaseException) -> None: + """Record an exception and mark a span as errored without re-raising.""" + if not _ENABLED: + return + with contextlib.suppress(Exception): + span_obj.record_exception(exc) + span_obj.set_status(_OtStatus(_OtStatusCode.ERROR, str(exc))) + + def _get_tracer(): if not _OTEL_AVAILABLE: return None @@ -452,6 +494,7 @@ def reload_for_tests() -> bool: __all__ = [ + "add_event", "chat_request_span", "compaction_span", "connector_sync_span", @@ -466,6 +509,7 @@ __all__ = [ "kb_search_span", "model_call_span", "permission_asked_span", + "record_error", "reload_for_tests", "span", "subagent_invoke_span", diff --git a/surfsense_backend/tests/unit/observability/test_helpers.py b/surfsense_backend/tests/unit/observability/test_helpers.py new file mode 100644 index 000000000..ae60c1939 --- /dev/null +++ b/surfsense_backend/tests/unit/observability/test_helpers.py @@ -0,0 +1,101 @@ +"""Tests for pure observability helper functions.""" + +from __future__ import annotations + +import pytest + +from app.observability import metrics as ot_metrics, otel as ot + +pytestmark = pytest.mark.unit + + +@pytest.fixture(autouse=True) +def _disable_otel(monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False) + monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true") + ot.reload_for_tests() + yield + ot.reload_for_tests() + + +@pytest.mark.parametrize( + ("task_name", "expected"), + [ + ("reindex_document", "reindex"), + ("delete_document_background", "delete"), + ("delete_folder_documents_background", "delete"), + ("delete_search_space_background", "delete"), + ("process_extension_document", "process"), + ("process_youtube_video", "process"), + ("process_file_upload", "process"), + ("process_file_upload_with_document", "process"), + ("process_circleback_meeting", "process"), + ("generate_video_presentation", "generate"), + ("generate_content_podcast", "generate"), + ("cleanup_stale_indexing_notifications", "cleanup"), + ("reconcile_pending_stripe_page_purchases", "reconcile"), + ("reconcile_pending_stripe_token_purchases", "reconcile"), + ("check_periodic_schedules", "check"), + ("ai_sort_search_space", "ai"), + ("index_notion_pages", "index"), + ("index_github_repos", "index"), + ("index_google_drive_files", "index"), + ("index_composio_connector", "index"), + ("index_obsidian_attachment", "index"), + ("index_local_folder", "index"), + ("index_uploaded_folder_files", "index"), + ("noseparator", "noseparator"), + ("", "unknown"), + ], +) +def test_parse_celery_task_label(task_name: str, expected: str) -> None: + assert ot_metrics.parse_celery_task_label(task_name) == expected + + +def test_parse_celery_task_label_handles_none() -> None: + assert ot_metrics.parse_celery_task_label(None) == "unknown" + + +@pytest.mark.parametrize( + ("exc", "expected"), + [ + (type("RateLimitError", (Exception,), {})(), "rate_limited"), + (type("AuthenticationError", (Exception,), {})(), "auth_failed"), + (type("QuotaInsufficientError", (Exception,), {})(), "quota_exhausted"), + (TimeoutError(), "timeout"), + (type("APIConnectionError", (Exception,), {})(), "network_failed"), + (type("ServiceUnavailableError", (Exception,), {})(), "server_error"), + (type("LockContentionError", (Exception,), {})(), "lock_contention"), + (type("UnsupportedFormatError", (Exception,), {})(), "unsupported_format"), + (type("ProviderError", (Exception,), {})(), "provider_error"), + (RuntimeError("plain"), "unknown"), + ], +) +def test_categorize_exception(exc: BaseException, expected: str) -> None: + assert ot_metrics.categorize_exception(exc) == expected + + +def test_record_celery_queue_latency_noops_when_disabled() -> None: + ot_metrics.record_celery_queue_latency( + 0.5, + task_name="index_notion_pages", + queue="surfsense.connectors", + scheduled=False, + operation="index", + ) + + +def test_add_event_noops_when_disabled() -> None: + ot.add_event("test.event", {"value": 1}) + + +def test_add_event_noops_without_current_span(monkeypatch: pytest.MonkeyPatch) -> None: + class FakeTrace: + @staticmethod + def get_current_span(): + return None + + monkeypatch.setattr(ot, "_ENABLED", True) + monkeypatch.setattr(ot, "_ot_trace", FakeTrace()) + + ot.add_event("test.event", {"value": 1}) From dc893281ba8ed27e55fbb0a89b88a9096b2e3722 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:48:43 +0530 Subject: [PATCH 21/28] feat(chat): add model retry and stream lifecycle events --- .../agents/new_chat/middleware/retry_after.py | 20 ++++ .../middleware/scoped_model_fallback.py | 24 ++++- .../app/tasks/chat/stream_new_chat.py | 100 +++++++++++++++++- 3 files changed, 138 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/app/agents/new_chat/middleware/retry_after.py b/surfsense_backend/app/agents/new_chat/middleware/retry_after.py index 0c3d3d017..321185dee 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/retry_after.py +++ b/surfsense_backend/app/agents/new_chat/middleware/retry_after.py @@ -45,6 +45,8 @@ from langchain.agents.middleware.types import ( from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event from langchain_core.messages import AIMessage +from app.observability import metrics as ot_metrics, otel as ot + logger = logging.getLogger(__name__) # Names of exception classes for which a retry would not help — context @@ -198,6 +200,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp if not self._should_retry(exc) or attempt >= self.max_retries: raise delay = self._delay_for_attempt(attempt, exc) + ot.add_event( + "model.retry.scheduled", + { + "retry.attempt": attempt + 1, + "retry.max": self.max_retries, + "retry.delay_ms": int(delay * 1000), + "retry.reason": ot_metrics.categorize_exception(exc), + }, + ) try: dispatch_custom_event( "surfsense.retrying", @@ -231,6 +242,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp if not self._should_retry(exc) or attempt >= self.max_retries: raise delay = self._delay_for_attempt(attempt, exc) + ot.add_event( + "model.retry.scheduled", + { + "retry.attempt": attempt + 1, + "retry.max": self.max_retries, + "retry.delay_ms": int(delay * 1000), + "retry.reason": ot_metrics.categorize_exception(exc), + }, + ) try: await adispatch_custom_event( "surfsense.retrying", diff --git a/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py b/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py index 99eb2d74a..0294e2839 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py +++ b/surfsense_backend/app/agents/new_chat/middleware/scoped_model_fallback.py @@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any from langchain.agents.middleware import ModelFallbackMiddleware +from app.observability import metrics as ot_metrics, otel as ot + if TYPE_CHECKING: from collections.abc import Awaitable, Callable @@ -55,7 +57,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware): raise last_exception = e - for fallback_model in self.models: + for attempt, fallback_model in enumerate(self.models, start=1): + ot.add_event( + "model.fallback", + { + "fallback.attempt": attempt, + "fallback.from": attempt - 1, + "fallback.to": attempt, + "fallback.reason": ot_metrics.categorize_exception(last_exception), + }, + ) try: return handler(request.override(model=fallback_model)) except Exception as e: @@ -79,7 +90,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware): raise last_exception = e - for fallback_model in self.models: + for attempt, fallback_model in enumerate(self.models, start=1): + ot.add_event( + "model.fallback", + { + "fallback.attempt": attempt, + "fallback.from": attempt - 1, + "fallback.to": attempt, + "fallback.reason": ot_metrics.categorize_exception(last_exception), + }, + ) try: return await handler(request.override(model=fallback_model)) except Exception as e: diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index b961733b4..fee50d72d 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -14,6 +14,7 @@ import contextlib import gc import json import logging +import sys import time from collections.abc import AsyncGenerator from dataclasses import dataclass, field @@ -886,6 +887,7 @@ async def stream_new_chat( stream_result.client_platform = fs_platform chat_agent_mode = "unknown" chat_outcome = "success" + chat_error_category: str | None = None chat_span_cm = ot.chat_request_span( chat_id=chat_id, search_space_id=search_space_id, @@ -985,6 +987,14 @@ async def stream_new_chat( requires_image_input=_requires_image_input, ) ).resolved_llm_config_id + ot.add_event( + "model.pin.resolved", + { + "pin.requested_id": requested_llm_config_id, + "pin.resolved_id": llm_config_id, + "pin.requires_image_input": _requires_image_input, + }, + ) except ValueError as pin_error: # Auto-pin's "no vision-capable cfg" path raises a ValueError # whose message we map to the friendly image-input SSE error @@ -1001,6 +1011,13 @@ async def stream_new_chat( if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT" else "server_error" ) + if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT": + ot.add_event( + "quota.denied", + { + "quota.code": error_code, + }, + ) yield _emit_stream_error( message=str(pin_error), error_kind=error_kind, @@ -1055,6 +1072,12 @@ async def stream_new_chat( model_label = ( agent_config.config_name or agent_config.model_name or "model" ) + ot.add_event( + "quota.denied", + { + "quota.code": "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT", + }, + ) yield _emit_stream_error( message=( f"The selected model ({model_label}) does not support " @@ -1098,6 +1121,12 @@ async def stream_new_chat( ) _premium_reserved_micros = reserve_amount_micros if not quota_result.allowed: + ot.add_event( + "quota.denied", + { + "quota.code": "PREMIUM_QUOTA_EXHAUSTED", + }, + ) if requested_llm_config_id == 0: try: llm_config_id = ( @@ -1111,6 +1140,13 @@ async def stream_new_chat( requires_image_input=_requires_image_input, ) ).resolved_llm_config_id + ot.add_event( + "model.repin", + { + "repin.reason": "premium_quota_exhausted", + "repin.to_config_id": llm_config_id, + }, + ) except ValueError as pin_error: yield _emit_stream_error( message=str(pin_error), @@ -1880,6 +1916,14 @@ async def stream_new_chat( llm_config_id, time.perf_counter() - _t0, ) + ot.add_event( + "chat.rate_limit.recovered", + { + "recovery.reason": "provider_rate_limited", + "recovery.previous_config_id": previous_config_id, + "recovery.fallback_config_id": llm_config_id, + }, + ) _log_chat_stream_error( flow=flow, error_kind="rate_limited", @@ -1910,6 +1954,12 @@ async def stream_new_chat( log_system_snapshot("stream_new_chat_END") if stream_result.is_interrupted: + ot.add_event( + "chat.interrupted", + { + "chat.flow": flow, + }, + ) if title_task is not None and not title_task.done(): title_task.cancel() @@ -2029,9 +2079,11 @@ async def stream_new_chat( error_extra, ) = _classify_stream_exception(e, flow_label="chat") chat_outcome = error_code or error_kind or "error" + chat_error_category = ot_metrics.categorize_exception(e) with contextlib.suppress(Exception): chat_span.set_attribute("chat.outcome", chat_outcome) - chat_span.record_exception(e) + chat_span.set_attribute("error.category", chat_error_category) + ot.record_error(chat_span, e) error_message = f"Error during chat: {e!s}" print(f"[stream_new_chat] {error_message}") print(f"[stream_new_chat] Exception type: {type(e).__name__}") @@ -2234,8 +2286,9 @@ async def stream_new_chat( flow=flow, outcome=chat_outcome, agent_mode=chat_agent_mode, + error_category=chat_error_category, ) - chat_span_cm.__exit__(None, None, None) + chat_span_cm.__exit__(*sys.exc_info()) async def stream_resume_chat( @@ -2262,6 +2315,7 @@ async def stream_resume_chat( stream_result.client_platform = fs_platform chat_agent_mode = "unknown" chat_outcome = "success" + chat_error_category: str | None = None chat_span_cm = ot.chat_request_span( chat_id=chat_id, search_space_id=search_space_id, @@ -2345,6 +2399,14 @@ async def stream_resume_chat( selected_llm_config_id=llm_config_id, ) ).resolved_llm_config_id + ot.add_event( + "model.pin.resolved", + { + "pin.requested_id": requested_llm_config_id, + "pin.resolved_id": llm_config_id, + "pin.requires_image_input": False, + }, + ) except ValueError as pin_error: yield _emit_stream_error( message=str(pin_error), @@ -2401,6 +2463,12 @@ async def stream_resume_chat( ) _resume_premium_reserved_micros = reserve_amount_micros if not quota_result.allowed: + ot.add_event( + "quota.denied", + { + "quota.code": "PREMIUM_QUOTA_EXHAUSTED", + }, + ) if requested_llm_config_id == 0: try: llm_config_id = ( @@ -2413,6 +2481,13 @@ async def stream_resume_chat( force_repin_free=True, ) ).resolved_llm_config_id + ot.add_event( + "model.repin", + { + "repin.reason": "premium_quota_exhausted", + "repin.to_config_id": llm_config_id, + }, + ) except ValueError as pin_error: yield _emit_stream_error( message=str(pin_error), @@ -2748,6 +2823,14 @@ async def stream_resume_chat( llm_config_id, time.perf_counter() - _t0, ) + ot.add_event( + "chat.rate_limit.recovered", + { + "recovery.reason": "provider_rate_limited", + "recovery.previous_config_id": previous_config_id, + "recovery.fallback_config_id": llm_config_id, + }, + ) _log_chat_stream_error( flow="resume", error_kind="rate_limited", @@ -2775,6 +2858,12 @@ async def stream_resume_chat( chat_id, ) if stream_result.is_interrupted: + ot.add_event( + "chat.interrupted", + { + "chat.flow": "resume", + }, + ) usage_summary = accumulator.per_message_summary() _perf_log.info( "[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s", @@ -2869,9 +2958,11 @@ async def stream_resume_chat( error_extra, ) = _classify_stream_exception(e, flow_label="resume") chat_outcome = error_code or error_kind or "error" + chat_error_category = ot_metrics.categorize_exception(e) with contextlib.suppress(Exception): chat_span.set_attribute("chat.outcome", chat_outcome) - chat_span.record_exception(e) + chat_span.set_attribute("error.category", chat_error_category) + ot.record_error(chat_span, e) error_message = f"Error during resume: {e!s}" print(f"[stream_resume_chat] {error_message}") print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}") @@ -3033,5 +3124,6 @@ async def stream_resume_chat( flow="resume", outcome=chat_outcome, agent_mode=chat_agent_mode, + error_category=chat_error_category, ) - chat_span_cm.__exit__(None, None, None) + chat_span_cm.__exit__(*sys.exc_info()) From 6e03ab044a7ad10ead96a22ced444bfa5672bf0c Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:49:02 +0530 Subject: [PATCH 22/28] feat(tasks): measure Celery queue latency --- surfsense_backend/app/celery_app.py | 61 ++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index cfb24731d..39406fb6b 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -1,16 +1,75 @@ """Celery application configuration and setup.""" +import contextlib import os +import time from celery import Celery from celery.schedules import crontab -from celery.signals import worker_process_init +from celery.signals import before_task_publish, task_prerun, worker_process_init from dotenv import load_dotenv +try: + from opentelemetry import trace +except ImportError: # pragma: no cover - optional OTel dependency + trace = None # type: ignore[assignment] + # Load environment variables load_dotenv() +@before_task_publish.connect +def _stamp_enqueue_time(headers=None, **_kwargs): + """Stamp enqueue time so workers can measure queue wait.""" + if headers is None: + return + with contextlib.suppress(Exception): + headers["surfsense.enqueued_at_ns"] = str(time.monotonic_ns()) + + +@task_prerun.connect +def _record_queue_latency(task=None, **_kwargs): + """Record queue latency and attach the generic operation to task spans.""" + if task is None: + return + try: + from app.observability import metrics as ot_metrics + + task_name = getattr(task, "name", None) or "unknown" + operation = ot_metrics.parse_celery_task_label(task_name) + request = getattr(task, "request", None) + delivery_info = getattr(request, "delivery_info", None) or {} + queue = delivery_info.get("routing_key") or "unknown" + scheduled = bool( + getattr(request, "eta", None) or getattr(request, "expires", None) + ) + + if trace is not None: + with contextlib.suppress(Exception): + span = trace.get_current_span() + span.set_attribute("celery.task.operation", operation) + + headers = getattr(request, "headers", None) or {} + enqueued_ns = headers.get("surfsense.enqueued_at_ns") + if enqueued_ns is None: + return + + elapsed_s = (time.monotonic_ns() - int(enqueued_ns)) / 1e9 + ot_metrics.record_celery_queue_latency( + elapsed_s, + task_name=task_name, + queue=queue, + scheduled=scheduled, + operation=operation, + ) + if trace is not None: + with contextlib.suppress(Exception): + span = trace.get_current_span() + span.set_attribute("celery.queue.latency_ms", elapsed_s * 1000) + except Exception: + pass + + @worker_process_init.connect def init_worker(**kwargs): """Initialize the LLM Router and Image Gen Router when a Celery worker process starts. From c4abbd6e2007b3400e3be1de504f57407cce86a6 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:49:46 +0530 Subject: [PATCH 23/28] feat(pipeline): enrich ETL and indexing failure telemetry --- .../app/etl_pipeline/etl_pipeline_service.py | 48 +++++++++++++++++-- .../indexing_pipeline_service.py | 8 +++- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py index 4cdd387b0..496c6d0c3 100644 --- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -30,6 +30,7 @@ class EtlPipelineService: category = classify_file(request.filename) start = time.perf_counter() status = "success" + error_category: str | None = None result: EtlResult | None = None with ot.etl_extract_span( content_type=category.value, @@ -75,8 +76,9 @@ class EtlPipelineService: result = await self._extract_document(request) return result - except Exception: + except Exception as exc: status = "error" + error_category = ot_metrics.categorize_exception(exc) raise finally: with contextlib.suppress(Exception): @@ -94,6 +96,7 @@ class EtlPipelineService: etl_service=result.etl_service if result else None, content_type=result.content_type if result else category.value, status=status, + error_category=error_category, ) async def _extract_image(self, request: EtlRequest) -> EtlResult: @@ -134,11 +137,27 @@ class EtlPipelineService: request.filename, exc_info=True, ) + ot.add_event( + "etl.fallback", + { + "fallback.from": "vision_llm", + "fallback.to": "document_parser", + "fallback.reason": ot_metrics.categorize_exception(exc), + }, + ) else: logging.info( "No vision LLM provided, falling back to document parser for %s", request.filename, ) + ot.add_event( + "etl.fallback", + { + "fallback.from": "vision_llm", + "fallback.to": "document_parser", + "fallback.reason": "not_configured", + }, + ) try: with ot.etl_ocr_span( @@ -246,6 +265,13 @@ class EtlPipelineService: # Common case: the configured ETL service can't OCR # this image format (or no service is configured at # all). Don't spam warnings -- just no OCR for it. + ot.add_event( + "etl.ocr.skipped", + { + "skip.reason": "unsupported_format", + "error.category": ot_metrics.categorize_exception(exc), + }, + ) logging.debug("Skipping per-image OCR for %s: %s", image_name, exc) return "" return ocr_result.markdown_content @@ -264,9 +290,17 @@ class EtlPipelineService: sp.set_attribute("image.skipped.too_large", result.skipped_too_large) sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate) sp.set_attribute("etl.status", "success") - except Exception: + except Exception as exc: # Picture description is additive; never let it fail an # otherwise-successful document extraction. + ot.add_event( + "etl.degraded", + { + "degraded.reason": "picture_describe_failed", + "degraded.action": "return_parser_output", + "error.category": ot_metrics.categorize_exception(exc), + }, + ) logging.warning( "Picture description failed for %s, returning parser output unchanged", request.filename, @@ -319,7 +353,15 @@ class EtlPipelineService: return await parse_with_azure_doc_intelligence( request.file_path, processing_mode=mode_value ) - except Exception: + except Exception as exc: + ot.add_event( + "etl.fallback", + { + "fallback.from": "azure_di", + "fallback.to": "llamacloud", + "fallback.reason": ot_metrics.categorize_exception(exc), + }, + ) logging.warning( "Azure Document Intelligence failed for %s, " "falling back to LlamaCloud", diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index 2aa92bd9b..282bd6034 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -2,6 +2,7 @@ import asyncio import contextlib import hashlib import logging +import sys import time from collections.abc import Awaitable, Callable from dataclasses import dataclass, field @@ -445,6 +446,7 @@ class IndexingPipelineService: await self._enqueue_ai_sort_if_enabled(document) except RETRYABLE_LLM_ERRORS as e: + ot.record_error(persist_span, e) log_retryable_llm_error(ctx, e) outcome_status = "requeued" await rollback_and_persist_failure( @@ -452,24 +454,28 @@ class IndexingPipelineService: ) except PERMANENT_LLM_ERRORS as e: + ot.record_error(persist_span, e) log_permanent_llm_error(ctx, e) await rollback_and_persist_failure( self.session, document, llm_permanent_message(e) ) except RecursionError as e: + ot.record_error(persist_span, e) log_chunking_overflow(ctx, e) await rollback_and_persist_failure( self.session, document, PipelineMessages.CHUNKING_OVERFLOW ) except EMBEDDING_ERRORS as e: + ot.record_error(persist_span, e) log_embedding_error(ctx, e) await rollback_and_persist_failure( self.session, document, embedding_message(e) ) except Exception as e: + ot.record_error(persist_span, e) log_unexpected_error(ctx, e) await rollback_and_persist_failure( self.session, document, safe_exception_message(e) @@ -488,7 +494,7 @@ class IndexingPipelineService: document_type=document_type, status=outcome_status, ) - persist_span_cm.__exit__(None, None, None) + persist_span_cm.__exit__(*sys.exc_info()) return document async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None: From 7a3b278b7567041f48c06cfe09e23ca7af9a69f5 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 17:50:02 +0530 Subject: [PATCH 24/28] feat(connectors): add retry and auth telemetry events --- .../routes/search_source_connectors_routes.py | 24 ++++++++++++++++++- .../app/tasks/celery_tasks/connector_tasks.py | 14 +++++++---- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 4d7e6b2ef..3060fdf4a 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -43,7 +43,7 @@ from app.db import ( async_session_maker, get_async_session, ) -from app.observability import metrics as ot_metrics +from app.observability import metrics as ot_metrics, otel as ot from app.schemas import ( GoogleDriveIndexRequest, MCPConnectorCreate, @@ -1246,6 +1246,12 @@ async def _persist_auth_expired(session: AsyncSession, connector_id: int) -> Non """Flag a connector as auth_expired so the frontend shows a re-auth prompt.""" from sqlalchemy.orm.attributes import flag_modified + ot.add_event( + "connector.auth.expired", + { + "error.category": "auth_failed", + }, + ) try: result = await session.execute( select(SearchSourceConnector).where( @@ -1305,6 +1311,13 @@ async def _run_indexing_with_notifications( try: connector_lock_acquired = acquire_connector_indexing_lock(connector_id) if not connector_lock_acquired: + ot.add_event( + "connector.sync.skipped", + { + "skip.reason": "lock_contention", + "error.category": "lock_contention", + }, + ) logger.info( f"Skipping indexing for connector {connector_id} " "(another worker already holds Redis connector lock)" @@ -1375,6 +1388,15 @@ async def _run_indexing_with_notifications( ) -> None: """Callback to update notification during API retries (rate limits, etc.)""" nonlocal notification + ot.add_event( + "connector.retry.scheduled", + { + "retry.reason": retry_reason, + "retry.attempt": attempt, + "retry.max": max_attempts, + "retry.delay_ms": int(wait_seconds * 1000), + }, + ) if notification: try: await session.refresh(notification) diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index e0f0f09c9..50f757473 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -22,15 +22,18 @@ def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T: task_name = getattr(current_task, "name", None) or "unknown" t0 = time.perf_counter() status = "failed" + error_category: str | None = None try: with ot.connector_sync_span(connector_type=task_name) as sp: - result = _run_async_celery_task(coro_factory) - sp.set_attribute("connector.status", "success") + try: + result = _run_async_celery_task(coro_factory) + sp.set_attribute("connector.status", "success") + except Exception as exc: + error_category = ot_metrics.categorize_exception(exc) + sp.set_attribute("connector.error.category", error_category) + raise status = "success" return result - except Exception: - status = "failed" - raise finally: elapsed_s = time.perf_counter() - t0 ot_metrics.record_connector_sync_duration( @@ -40,6 +43,7 @@ def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T: ot_metrics.record_connector_sync_outcome( connector_type=task_name, status=status, + error_category=error_category, ) From 51e4d8b489dc2972c43aa63fd0ea174a36e5057c Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 22 May 2026 18:19:38 +0530 Subject: [PATCH 25/28] feat(tasks): enhance Celery task telemetry with queue metadata and latency tracking --- surfsense_backend/app/celery_app.py | 48 +++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 39406fb6b..5b45baca1 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -6,7 +6,12 @@ import time from celery import Celery from celery.schedules import crontab -from celery.signals import before_task_publish, task_prerun, worker_process_init +from celery.signals import ( + before_task_publish, + task_postrun, + task_prerun, + worker_process_init, +) from dotenv import load_dotenv try: @@ -29,7 +34,7 @@ def _stamp_enqueue_time(headers=None, **_kwargs): @task_prerun.connect def _record_queue_latency(task=None, **_kwargs): - """Record queue latency and attach the generic operation to task spans.""" + """Record queue latency and stash task metadata for span enrichment.""" if task is None: return try: @@ -44,10 +49,10 @@ def _record_queue_latency(task=None, **_kwargs): getattr(request, "eta", None) or getattr(request, "expires", None) ) - if trace is not None: - with contextlib.suppress(Exception): - span = trace.get_current_span() - span.set_attribute("celery.task.operation", operation) + with contextlib.suppress(Exception): + request.surfsense_operation = operation + request.surfsense_queue = queue + request.surfsense_scheduled = scheduled headers = getattr(request, "headers", None) or {} enqueued_ns = headers.get("surfsense.enqueued_at_ns") @@ -55,6 +60,9 @@ def _record_queue_latency(task=None, **_kwargs): return elapsed_s = (time.monotonic_ns() - int(enqueued_ns)) / 1e9 + with contextlib.suppress(Exception): + request.surfsense_queue_latency_ms = elapsed_s * 1000 + ot_metrics.record_celery_queue_latency( elapsed_s, task_name=task_name, @@ -62,10 +70,30 @@ def _record_queue_latency(task=None, **_kwargs): scheduled=scheduled, operation=operation, ) - if trace is not None: - with contextlib.suppress(Exception): - span = trace.get_current_span() - span.set_attribute("celery.queue.latency_ms", elapsed_s * 1000) + except Exception: + pass + + +@task_postrun.connect +def _set_celery_span_attributes(task=None, **_kwargs): + """Attach derived queue metadata to the active Celery run span.""" + if task is None or trace is None: + return + + try: + request = getattr(task, "request", None) + if request is None: + return + + span = trace.get_current_span() + + operation = getattr(request, "surfsense_operation", None) + if operation: + span.set_attribute("celery.task.operation", operation) + + latency_ms = getattr(request, "surfsense_queue_latency_ms", None) + if latency_ms is not None: + span.set_attribute("celery.queue.latency_ms", latency_ms) except Exception: pass From df698e021652f6472de09a1006094f415da09309 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 23 May 2026 00:17:23 +0530 Subject: [PATCH 26/28] feat(observability): integrate OpenTelemetry collector and configuration for enhanced telemetry --- docker/.env.example | 22 +++++ docker/docker-compose.yml | 23 ++++++ docker/otel-collector/config.yaml | 81 +++++++++++++++++++ surfsense_backend/.env.example | 7 +- .../app/observability/bootstrap.py | 7 +- .../tests/unit/observability/test_otel.py | 27 ++++++- 6 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 docker/otel-collector/config.yaml diff --git a/docker/.env.example b/docker/.env.example index 4de35a5e9..2ff15e71d 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -304,6 +304,28 @@ STT_SERVICE=local/base # LANGSMITH_API_KEY= # LANGSMITH_PROJECT=surfsense +# OpenTelemetry traces and metrics. +# Enable the collector with: docker compose --profile observability up -d +# SURFSENSE_ENABLE_OTEL=true +# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +# OTEL_EXPORTER_OTLP_PROTOCOL=grpc +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# +# Emergency kill switch. +# OTEL_SDK_DISABLED=true +# +# Grafana Cloud OTLP credentials. These are used only by the collector container. +# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-.grafana.net/otlp +# GRAFANA_CLOUD_INSTANCE_ID= +# GRAFANA_CLOUD_API_KEY= +# +# Optional host port overrides for the bundled OTel Collector. Only change +# these if the host already uses 4317/4318/13133; backend containers still use +# the internal Docker endpoint above. +# OTEL_GRPC_PORT=4317 +# OTEL_HTTP_PORT=4318 +# OTEL_HEALTH_PORT=13133 + # ------------------------------------------------------------------------------ # Advanced (optional) # ------------------------------------------------------------------------------ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 82d77f826..06a3ac79a 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -61,6 +61,29 @@ services: timeout: 5s retries: 5 + otel-collector: + image: otel/opentelemetry-collector-contrib:0.152.1 + profiles: + - observability + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro + environment: + GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-} + GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-} + GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-} + ports: + - "${OTEL_GRPC_PORT:-4317}:4317" + - "${OTEL_HTTP_PORT:-4318}:4318" + - "${OTEL_HEALTH_PORT:-13133}:13133" + mem_limit: 2g + restart: unless-stopped + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 30s + timeout: 5s + retries: 3 + searxng: image: searxng/searxng:2026.3.13-3c1f68c59 volumes: diff --git a/docker/otel-collector/config.yaml b/docker/otel-collector/config.yaml new file mode 100644 index 000000000..f495eff9b --- /dev/null +++ b/docker/otel-collector/config.yaml @@ -0,0 +1,81 @@ +extensions: + health_check: + endpoint: 0.0.0.0:13133 + basicauth/grafana_cloud: + client_auth: + username: ${env:GRAFANA_CLOUD_INSTANCE_ID} + password: ${env:GRAFANA_CLOUD_API_KEY} + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # Percentage limits are calculated against the collector container memory limit. + # Keep docker-compose.yml/Coolify memory limit set for predictability. + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + + attributes/scrub: + actions: + - key: http.request.header.authorization + action: delete + - key: http.request.header.cookie + action: delete + - key: db.statement + action: delete + + tail_sampling: + decision_wait: 10s + num_traces: 50000 + expected_new_traces_per_sec: 100 + policies: + - name: errors + type: status_code + status_code: + status_codes: [ERROR] + - name: slow-requests + type: latency + latency: + threshold_ms: 500 + - name: baseline + type: probabilistic + probabilistic: + sampling_percentage: 100 + + batch: + timeout: 5s + send_batch_size: 1024 + send_batch_max_size: 2048 + +exporters: + otlp_http/grafana_cloud: + endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT} + auth: + authenticator: basicauth/grafana_cloud + sending_queue: + enabled: true + queue_size: 10000 + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + +service: + extensions: [health_check, basicauth/grafana_cloud] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, attributes/scrub, tail_sampling, batch] + exporters: [otlp_http/grafana_cloud] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp_http/grafana_cloud] diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 908f4645d..91b03770e 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -310,10 +310,9 @@ LANGSMITH_PROJECT=surfsense # use http://otel-lgtm:4317 instead. # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf -# OTEL_SERVICE_NAME=surfsense-backend -# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production -# OTEL_METRIC_EXPORT_INTERVAL=60000 # ms -# OTEL_SDK_DISABLED=false # spec kill-switch +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes +# OTEL_SDK_DISABLED=true # emergency kill-switch # Skills + subagents # SURFSENSE_ENABLE_SKILLS=false diff --git a/surfsense_backend/app/observability/bootstrap.py b/surfsense_backend/app/observability/bootstrap.py index ad1d02ea8..f9ed65e7b 100644 --- a/surfsense_backend/app/observability/bootstrap.py +++ b/surfsense_backend/app/observability/bootstrap.py @@ -71,7 +71,7 @@ def _build_resource(): "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"), "service.version": _package_version(), "service.instance.id": socket.gethostname(), - "deployment.environment": _deployment_environment(), + "deployment.environment.name": _deployment_environment(), } ) @@ -336,7 +336,10 @@ def init_logs() -> None: def _run() -> None: from opentelemetry.instrumentation.logging import LoggingInstrumentor - LoggingInstrumentor().instrument() + # Required for stdlib LogRecords to receive otelTraceID/otelSpanID. + # logging.basicConfig is already installed by main.py, so this does not + # take over formatting in normal app startup. + LoggingInstrumentor().instrument(set_logging_format=True) if _safe_instrument("logging", _run): _LOGS_INITIALIZED = True diff --git a/surfsense_backend/tests/unit/observability/test_otel.py b/surfsense_backend/tests/unit/observability/test_otel.py index be40cccde..52ccba82f 100644 --- a/surfsense_backend/tests/unit/observability/test_otel.py +++ b/surfsense_backend/tests/unit/observability/test_otel.py @@ -113,12 +113,37 @@ class TestBootstrapConfig: resource = bootstrap._build_resource() attrs = dict(resource.attributes) assert attrs["service.name"] == "custom-backend" - assert attrs["deployment.environment"] == "test" + assert attrs["deployment.environment.name"] == "test" assert attrs["service.instance.id"] def test_shutdown_is_safe_without_providers(self) -> None: bootstrap.shutdown_otel() + def test_init_logs_enables_log_correlation( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + calls: list[dict[str, object]] = [] + + class FakeLoggingInstrumentor: + def instrument(self, **kwargs: object) -> None: + calls.append(kwargs) + + def fake_safe_instrument(name: str, callback): + assert name == "logging" + monkeypatch.setattr( + "opentelemetry.instrumentation.logging.LoggingInstrumentor", + FakeLoggingInstrumentor, + ) + callback() + return True + + monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False) + monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument) + + bootstrap.init_logs() + + assert calls == [{"set_logging_format": True}] + class TestMetricHelpers: def test_all_metric_helpers_noop_safely_when_disabled(self) -> None: From 4c8d47617da5619c8ddcdc1a6f52136435d2fe4d Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 23 May 2026 02:13:24 +0530 Subject: [PATCH 27/28] feat(env): add SURFSENSE_ENV variable for deployment environment and update observability resource attributes --- docker/.env.example | 5 ++++- surfsense_backend/.env.example | 5 ++++- surfsense_backend/app/observability/bootstrap.py | 13 ++++++------- .../tests/unit/observability/test_otel.py | 12 ++++++++++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/docker/.env.example b/docker/.env.example index 2ff15e71d..748f03048 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -7,6 +7,9 @@ # SurfSense version (use "latest" or a specific version like "0.0.14") SURFSENSE_VERSION=latest +# Deployment environment: dev or production +SURFSENSE_ENV=production + # ------------------------------------------------------------------------------ # Core Settings # ------------------------------------------------------------------------------ @@ -309,7 +312,7 @@ STT_SERVICE=local/base # SURFSENSE_ENABLE_OTEL=true # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc -# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense # # Emergency kill switch. # OTEL_SDK_DISABLED=true diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 91b03770e..b05369412 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -1,5 +1,8 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense +# Deployment environment: dev or production +SURFSENSE_ENV=dev + #Celery Config CELERY_BROKER_URL=redis://localhost:6379/0 CELERY_RESULT_BACKEND=redis://localhost:6379/0 @@ -310,7 +313,7 @@ LANGSMITH_PROJECT=surfsense # use http://otel-lgtm:4317 instead. # OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 # OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf -# OTEL_RESOURCE_ATTRIBUTES=deployment.environment.name=production,service.namespace=surfsense +# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense # OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes # OTEL_SDK_DISABLED=true # emergency kill-switch diff --git a/surfsense_backend/app/observability/bootstrap.py b/surfsense_backend/app/observability/bootstrap.py index f9ed65e7b..70008d43d 100644 --- a/surfsense_backend/app/observability/bootstrap.py +++ b/surfsense_backend/app/observability/bootstrap.py @@ -55,23 +55,22 @@ def _package_version() -> str: def _deployment_environment() -> str: - return ( - os.environ.get("SURFSENSE_ENV") - or os.environ.get("APP_ENV") - or os.environ.get("ENVIRONMENT") - or "dev" - ) + return os.environ.get("SURFSENSE_ENV", "dev") def _build_resource(): from opentelemetry.sdk.resources import Resource + deployment_environment = _deployment_environment() return Resource.create( { "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"), "service.version": _package_version(), "service.instance.id": socket.gethostname(), - "deployment.environment.name": _deployment_environment(), + "deployment.environment.name": deployment_environment, + # Compatibility alias for Grafana onboarding checks that still use + # the older semantic-convention key. + "deployment.environment": deployment_environment, } ) diff --git a/surfsense_backend/tests/unit/observability/test_otel.py b/surfsense_backend/tests/unit/observability/test_otel.py index 52ccba82f..d3718e7b9 100644 --- a/surfsense_backend/tests/unit/observability/test_otel.py +++ b/surfsense_backend/tests/unit/observability/test_otel.py @@ -114,8 +114,20 @@ class TestBootstrapConfig: attrs = dict(resource.attributes) assert attrs["service.name"] == "custom-backend" assert attrs["deployment.environment.name"] == "test" + assert attrs["deployment.environment"] == "test" assert attrs["service.instance.id"] + def test_deployment_environment_uses_surfsense_env_only( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("SURFSENSE_ENV", raising=False) + + assert bootstrap._deployment_environment() == "dev" + + monkeypatch.setenv("SURFSENSE_ENV", "production") + + assert bootstrap._deployment_environment() == "production" + def test_shutdown_is_safe_without_providers(self) -> None: bootstrap.shutdown_otel() From 6302939a72a19f7e58f76420306a0a6278390c80 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 23 May 2026 03:18:25 +0530 Subject: [PATCH 28/28] feat(docs): add observability documentation --- surfsense_web/content/docs/meta.json | 1 + surfsense_web/content/docs/observability.mdx | 167 +++++++++++++++++++ surfsense_web/lib/source.ts | 2 + 3 files changed, 170 insertions(+) create mode 100644 surfsense_web/content/docs/observability.mdx diff --git a/surfsense_web/content/docs/meta.json b/surfsense_web/content/docs/meta.json index a0b6f8a1b..13b599118 100644 --- a/surfsense_web/content/docs/meta.json +++ b/surfsense_web/content/docs/meta.json @@ -13,6 +13,7 @@ "how-to", "---Developers---", "testing", + "observability", "code-of-conduct" ] } diff --git a/surfsense_web/content/docs/observability.mdx b/surfsense_web/content/docs/observability.mdx new file mode 100644 index 000000000..c6dfb7e3a --- /dev/null +++ b/surfsense_web/content/docs/observability.mdx @@ -0,0 +1,167 @@ +--- +title: Observability +description: Configure backend traces and metrics for SurfSense +icon: Radar +--- + +SurfSense uses OpenTelemetry for backend traces and metrics. Application logs +include trace and span IDs so you can correlate logs with traces, but logs stay +on the normal container stderr path. + +## Enable Locally + +The development compose file reads backend settings from +`surfsense_backend/.env`. Add these values there: + +```dotenv +SURFSENSE_ENABLE_OTEL=true +SURFSENSE_ENV=dev +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-lgtm:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense +OTEL_METRIC_EXPORT_INTERVAL=300000 +``` + +Then start the development stack with the local LGTM backend: + +```bash +docker compose -f docker/docker-compose.dev.yml up --build +``` + +Grafana is exposed on `http://localhost:3001` by default. + +## Enable in Production Docker Compose + +Production Docker Compose reads backend and collector settings from +`docker/.env`. The API and Celery worker export telemetry to the bundled +collector at `otel-collector:4317`; the collector is the only service that uses +the Grafana Cloud credentials. + +Add these values to `docker/.env`: + +```dotenv +SURFSENSE_ENV=production +SURFSENSE_ENABLE_OTEL=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +OTEL_EXPORTER_OTLP_PROTOCOL=grpc +OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense +OTEL_METRIC_EXPORT_INTERVAL=300000 + +GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-.grafana.net/otlp +GRAFANA_CLOUD_INSTANCE_ID= +GRAFANA_CLOUD_API_KEY= +``` + +Then start the stack: + +```bash +docker compose -f docker/docker-compose.yml --profile observability up -d +``` + +The collector receives OTLP on `otel-collector:4317`, scrubs sensitive span +attributes, applies the configured tail-sampling policy, batches exports, +retries failures, and forwards traces and metrics to Grafana Cloud over OTLP +HTTP. + +When deploying `surfsense_backend/Dockerfile` directly instead of production +compose, use the same split: SurfSense containers export to a collector, and +the collector owns the Grafana Cloud credentials. + +## Automatic Traces + +When OpenTelemetry is enabled, the backend instruments: + +- FastAPI inbound requests. +- SQLAlchemy queries from the main async engine and Celery task engine. +- Raw psycopg calls used by the LangGraph checkpointer. +- Redis commands. +- HTTPX outbound requests. +- Celery producer and worker execution. + +## Manual Spans + +SurfSense keeps project-specific spans behind `app.observability.otel`: + +- `model.call` +- `tool.call` +- `chat.request` +- `kb.search` +- `kb.persist` +- `connector.sync` +- `subagent.invoke` +- `etl.extract` +- `etl.parse` +- `etl.ocr` +- `etl.picture.describe` +- `etl.picture.ocr` +- `compaction.run` +- `permission.asked` +- `interrupt.raised` + +Keep span names and attributes low-cardinality. Do not attach user content, +prompts, document titles, file paths, user-specific URLs, secrets, or raw +queries as span attributes. + +## Metrics + +The OpenTelemetry instrumentors provide HTTP, HTTPX, and Celery runtime +metrics. SurfSense adds these project metrics from `app.observability.metrics`: + +- `surfsense.model.call.duration` +- `gen_ai.client.token.usage` +- `surfsense.tool.call.duration` +- `surfsense.tool.call.errors` +- `surfsense.chat.request.duration` +- `surfsense.chat.request.outcome` +- `surfsense.kb.search.duration` +- `surfsense.compaction.runs` +- `surfsense.permission.asks` +- `surfsense.interrupt.raised` +- `surfsense.indexing.document.duration` +- `surfsense.indexing.document.outcome` +- `surfsense.connector.sync.duration` +- `surfsense.connector.sync.outcome` +- `surfsense.subagent.invoke.duration` +- `surfsense.subagent.invoke.outcome` +- `surfsense.etl.extract.duration` +- `surfsense.etl.extract.outcome` +- `surfsense.celery.heartbeat.refreshes` +- `surfsense.celery.heartbeat.failures` +- `surfsense.celery.queue.latency` +- `surfsense.auth.failures` +- `surfsense.rate_limit.rejections` +- `surfsense.perf.elapsed_ms` + +Runtime gauges include process RSS, CPU utilization, threads, open file +descriptors, asyncio tasks, and CPython GC counters. + +## Logs + +`LoggingInstrumentor().instrument()` injects `otelTraceID` and `otelSpanID` into +standard Python `LogRecord`s. The root log format writes them as +`trace_id=... span_id=...`. + +SurfSense intentionally does not create an OpenTelemetry `LoggerProvider`, +`LoggingHandler`, or `OTLPLogExporter`. Container stderr remains the log +transport. + +## Verification + +1. Hit a FastAPI endpoint and confirm an inbound server span appears in Grafana. +2. Run a chat request and confirm `model.call` and `tool.call` child spans. +3. Run a knowledge-base search and confirm `kb.search` spans and SQL child spans. +4. Run connector indexing and confirm Celery producer/worker spans share a trace + ID and connector sync metrics increment. +5. Confirm `gen_ai.client.token.usage`, model/tool durations, request duration, + Celery runtime, and runtime gauges appear within one export interval. +6. Confirm logs emitted inside a traced request show non-zero trace and span IDs. + +## Out Of Scope + +- Frontend/browser OpenTelemetry. +- OpenTelemetry log export. +- Profiling. +- Production backend selection. +- Tail-sampling collector configuration. +- Replacing LangSmith. +- Vendor SDKs. diff --git a/surfsense_web/lib/source.ts b/surfsense_web/lib/source.ts index b94f990ab..62fbb362b 100644 --- a/surfsense_web/lib/source.ts +++ b/surfsense_web/lib/source.ts @@ -7,6 +7,7 @@ import { Download, FlaskConical, Heart, + Radar, Unplug, Wrench, } from "lucide-react"; @@ -26,6 +27,7 @@ const DOCS_ICONS: Record = { Download, FlaskConical, Heart, + Radar, Unplug, Wrench, };