feat(connectors): add retry and auth telemetry events

This commit is contained in:
Anish Sarkar 2026-05-22 17:50:02 +05:30
parent c4abbd6e20
commit 7a3b278b75
2 changed files with 32 additions and 6 deletions

View file

@@ -43,7 +43,7 @@ from app.db import (
async_session_maker, async_session_maker,
get_async_session, get_async_session,
) )
from app.observability import metrics as ot_metrics from app.observability import metrics as ot_metrics, otel as ot
from app.schemas import ( from app.schemas import (
GoogleDriveIndexRequest, GoogleDriveIndexRequest,
MCPConnectorCreate, MCPConnectorCreate,
@@ -1246,6 +1246,12 @@ async def _persist_auth_expired(session: AsyncSession, connector_id: int) -> None:
"""Flag a connector as auth_expired so the frontend shows a re-auth prompt.""" """Flag a connector as auth_expired so the frontend shows a re-auth prompt."""
from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.orm.attributes import flag_modified
ot.add_event(
"connector.auth.expired",
{
"error.category": "auth_failed",
},
)
try: try:
result = await session.execute( result = await session.execute(
select(SearchSourceConnector).where( select(SearchSourceConnector).where(
@@ -1305,6 +1311,13 @@ async def _run_indexing_with_notifications(
try: try:
connector_lock_acquired = acquire_connector_indexing_lock(connector_id) connector_lock_acquired = acquire_connector_indexing_lock(connector_id)
if not connector_lock_acquired: if not connector_lock_acquired:
ot.add_event(
"connector.sync.skipped",
{
"skip.reason": "lock_contention",
"error.category": "lock_contention",
},
)
logger.info( logger.info(
f"Skipping indexing for connector {connector_id} " f"Skipping indexing for connector {connector_id} "
"(another worker already holds Redis connector lock)" "(another worker already holds Redis connector lock)"
@@ -1375,6 +1388,15 @@ async def _run_indexing_with_notifications(
) -> None: ) -> None:
"""Callback to update notification during API retries (rate limits, etc.)""" """Callback to update notification during API retries (rate limits, etc.)"""
nonlocal notification nonlocal notification
ot.add_event(
"connector.retry.scheduled",
{
"retry.reason": retry_reason,
"retry.attempt": attempt,
"retry.max": max_attempts,
"retry.delay_ms": int(wait_seconds * 1000),
},
)
if notification: if notification:
try: try:
await session.refresh(notification) await session.refresh(notification)

View file

@@ -22,15 +22,18 @@ def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
task_name = getattr(current_task, "name", None) or "unknown" task_name = getattr(current_task, "name", None) or "unknown"
t0 = time.perf_counter() t0 = time.perf_counter()
status = "failed" status = "failed"
error_category: str | None = None
try: try:
with ot.connector_sync_span(connector_type=task_name) as sp: with ot.connector_sync_span(connector_type=task_name) as sp:
result = _run_async_celery_task(coro_factory) try:
sp.set_attribute("connector.status", "success") result = _run_async_celery_task(coro_factory)
sp.set_attribute("connector.status", "success")
except Exception as exc:
error_category = ot_metrics.categorize_exception(exc)
sp.set_attribute("connector.error.category", error_category)
raise
status = "success" status = "success"
return result return result
except Exception:
status = "failed"
raise
finally: finally:
elapsed_s = time.perf_counter() - t0 elapsed_s = time.perf_counter() - t0
ot_metrics.record_connector_sync_duration( ot_metrics.record_connector_sync_duration(
@@ -40,6 +43,7 @@ def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
ot_metrics.record_connector_sync_outcome( ot_metrics.record_connector_sync_outcome(
connector_type=task_name, connector_type=task_name,
status=status, status=status,
error_category=error_category,
) )