mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat(observability): add OpenTelemetry process bootstrap
This commit is contained in:
parent
60049936e3
commit
eb2e2b253b
7 changed files with 413 additions and 21 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import asyncio
|
||||
import contextlib
|
||||
import gc
|
||||
import logging
|
||||
import time
|
||||
|
|
@ -36,13 +37,15 @@ from app.config import (
|
|||
)
|
||||
from app.db import User, create_db_and_tables, get_async_session
|
||||
from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
|
||||
from app.observability import metrics as ot_metrics
|
||||
from app.observability.bootstrap import init_otel, shutdown_otel
|
||||
from app.rate_limiter import get_real_client_ip, limiter
|
||||
from app.routes import router as crud_router
|
||||
from app.routes.auth_routes import router as auth_router
|
||||
from app.schemas import UserCreate, UserRead, UserUpdate
|
||||
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
|
||||
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
|
||||
from app.utils.perf import get_perf_logger, log_system_snapshot
|
||||
from app.utils.perf import log_system_snapshot
|
||||
|
||||
_error_logger = logging.getLogger("surfsense.errors")
|
||||
|
||||
|
|
@ -127,6 +130,8 @@ def _http_exception_handler(request: Request, exc: HTTPException) -> JSONRespons
|
|||
logged server-side.
|
||||
"""
|
||||
rid = _get_request_id(request)
|
||||
if exc.status_code in {401, 403} and request.url.path.startswith("/auth"):
|
||||
ot_metrics.record_auth_failure(reason=_status_to_code(exc.status_code))
|
||||
should_sanitize = exc.status_code == 500
|
||||
|
||||
# Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
|
||||
|
|
@ -213,6 +218,7 @@ def _validation_error_handler(
|
|||
def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
|
||||
"""Catch-all: log full traceback, return sanitized 500."""
|
||||
rid = _get_request_id(request)
|
||||
ot_metrics.record_auth_failure(reason="unhandled_exception")
|
||||
_error_logger.error(
|
||||
"[%s] Unhandled exception on %s %s",
|
||||
rid,
|
||||
|
|
@ -246,6 +252,7 @@ def _status_to_code(status_code: int, detail: str = "") -> str:
|
|||
def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
|
||||
"""Custom 429 handler that returns JSON matching our error envelope."""
|
||||
rid = _get_request_id(request)
|
||||
ot_metrics.record_rate_limit_rejection(scope="slowapi")
|
||||
retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
|
||||
return _build_error_response(
|
||||
429,
|
||||
|
|
@ -306,6 +313,7 @@ def _check_rate_limit_memory(
|
|||
f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} "
|
||||
f"({len(timestamps)}/{max_requests} in {window_seconds}s)"
|
||||
)
|
||||
ot_metrics.record_rate_limit_rejection(scope=scope)
|
||||
raise HTTPException(
|
||||
status_code=429,
|
||||
detail="RATE_LIMIT_EXCEEDED",
|
||||
|
|
@ -349,6 +357,7 @@ def _check_rate_limit(
|
|||
f"Rate limit exceeded on {scope} for IP {client_ip} "
|
||||
f"({current_count}/{max_requests} in {window_seconds}s)"
|
||||
)
|
||||
ot_metrics.record_rate_limit_rejection(scope=scope)
|
||||
raise HTTPException(
|
||||
status_code=429,
|
||||
detail="RATE_LIMIT_EXCEEDED",
|
||||
|
|
@ -558,6 +567,7 @@ async def lifespan(app: FastAPI):
|
|||
gc.set_threshold(700, 10, 5)
|
||||
|
||||
_enable_slow_callback_logging(threshold_sec=0.5)
|
||||
init_otel(app)
|
||||
await create_db_and_tables()
|
||||
await setup_checkpointer_tables()
|
||||
initialize_openrouter_integration()
|
||||
|
|
@ -592,6 +602,7 @@ async def lifespan(app: FastAPI):
|
|||
|
||||
_stop_openrouter_background_refresh()
|
||||
await close_checkpointer()
|
||||
shutdown_otel()
|
||||
|
||||
|
||||
def registration_allowed():
|
||||
|
|
@ -676,32 +687,20 @@ class RequestPerfMiddleware(BaseHTTPMiddleware):
|
|||
async def dispatch(
|
||||
self, request: StarletteRequest, call_next: RequestResponseEndpoint
|
||||
) -> StarletteResponse:
|
||||
perf = get_perf_logger()
|
||||
t0 = time.perf_counter()
|
||||
response = await call_next(request)
|
||||
elapsed_ms = (time.perf_counter() - t0) * 1000
|
||||
|
||||
path = request.url.path
|
||||
method = request.method
|
||||
status = response.status_code
|
||||
|
||||
perf.debug(
|
||||
"[request] %s %s -> %d in %.1fms",
|
||||
method,
|
||||
path,
|
||||
status,
|
||||
elapsed_ms,
|
||||
)
|
||||
|
||||
if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD:
|
||||
perf.warning(
|
||||
"[SLOW_REQUEST] %s %s -> %d in %.1fms (threshold=%.0fms)",
|
||||
method,
|
||||
path,
|
||||
status,
|
||||
elapsed_ms,
|
||||
_PERF_SLOW_REQUEST_THRESHOLD,
|
||||
)
|
||||
with contextlib.suppress(Exception):
|
||||
from opentelemetry import trace
|
||||
|
||||
span = trace.get_current_span()
|
||||
span.set_attribute("slow_request", True)
|
||||
span.set_attribute("surfsense.request.elapsed_ms", elapsed_ms)
|
||||
span.set_attribute("http.route", path)
|
||||
log_system_snapshot("slow_request")
|
||||
|
||||
return response
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue