Fix google calendar and notion erros

This commit is contained in:
Manoj Aggarwal 2026-02-02 12:07:53 -08:00
parent 8fb5a7fb8f
commit 48e646607b
5 changed files with 124 additions and 10 deletions

View file

@ -1,6 +1,7 @@
"""Celery tasks for connector indexing."""
import logging
import traceback
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import NullPool
@ -11,6 +12,36 @@ from app.config import config
logger = logging.getLogger(__name__)
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
"""
Handle greenlet_spawn errors with detailed logging for debugging.
The 'greenlet_spawn has not been called' error occurs when:
1. SQLAlchemy lazy-loads a relationship outside of an async context
2. A sync operation is called from an async context (or vice versa)
3. Session objects are accessed after the session is closed
This helper logs detailed context to help identify the root cause.
"""
error_str = str(e)
if "greenlet_spawn has not been called" in error_str:
logger.error(
f"GREENLET ERROR in {task_name} for connector {connector_id}: {error_str}\n"
f"This error typically occurs when SQLAlchemy tries to lazy-load a relationship "
f"outside of an async context. Check for:\n"
f"1. Accessing relationship attributes (e.g., document.chunks, connector.search_space) "
f"without using selectinload() or joinedload()\n"
f"2. Accessing model attributes after the session is closed\n"
f"3. Passing ORM objects between different async contexts\n"
f"Stack trace:\n{traceback.format_exc()}"
)
else:
logger.error(
f"Error in {task_name} for connector {connector_id}: {error_str}\n"
f"Stack trace:\n{traceback.format_exc()}"
)
def get_celery_session_maker():
"""
Create a new async session maker for Celery tasks.
@ -46,6 +77,9 @@ def index_slack_messages_task(
connector_id, search_space_id, user_id, start_date, end_date
)
)
except Exception as e:
_handle_greenlet_error(e, "index_slack_messages", connector_id)
raise
finally:
loop.close()
@ -89,6 +123,9 @@ def index_notion_pages_task(
connector_id, search_space_id, user_id, start_date, end_date
)
)
except Exception as e:
_handle_greenlet_error(e, "index_notion_pages", connector_id)
raise
finally:
loop.close()
@ -347,6 +384,9 @@ def index_google_calendar_events_task(
connector_id, search_space_id, user_id, start_date, end_date
)
)
except Exception as e:
_handle_greenlet_error(e, "index_google_calendar_events", connector_id)
raise
finally:
loop.close()
@ -696,6 +736,9 @@ def index_crawled_urls_task(
connector_id, search_space_id, user_id, start_date, end_date
)
)
except Exception as e:
_handle_greenlet_error(e, "index_crawled_urls", connector_id)
raise
finally:
loop.close()

View file

@ -159,6 +159,20 @@ def calculate_date_range(
)
end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
# FIX: Ensure end_date is at least 1 day after start_date to avoid
# "start_date must be strictly before end_date" errors when dates are the same
# (e.g., when last_indexed_at is today)
if start_date_str == end_date_str:
logger.info(
f"Start date ({start_date_str}) equals end date ({end_date_str}), "
"adjusting end date to next day to ensure valid date range"
)
# Parse end_date and add 1 day
end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
end_dt = end_dt + timedelta(days=1)
end_date_str = end_dt.strftime("%Y-%m-%d")
logger.info(f"Adjusted end date to {end_date_str}")
return start_date_str, end_date_str

View file

@ -217,6 +217,20 @@ async def index_google_calendar_events(
start_date_str = start_date
end_date_str = end_date
# FIX: Ensure end_date is at least 1 day after start_date to avoid
# "start_date must be strictly before end_date" errors when dates are the same
# (e.g., when last_indexed_at is today)
if start_date_str == end_date_str:
logger.info(
f"Start date ({start_date_str}) equals end date ({end_date_str}), "
"adjusting end date to next day to ensure valid date range"
)
# Parse end_date and add 1 day
end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
end_dt = end_dt + timedelta(days=1)
end_date_str = end_dt.strftime("%Y-%m-%d")
logger.info(f"Adjusted end date to {end_date_str}")
await task_logger.log_task_progress(
log_entry,
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",

View file

@ -196,13 +196,44 @@ async def index_notion_pages(
"Recommend reconnecting with OAuth."
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages for connector {connector_id}",
str(e),
{"error_type": "PageFetchError"},
error_str = str(e)
# Check if this is an unsupported block type error (transcription, ai_block, etc.)
# These are known Notion API limitations and should be logged as warnings, not errors
unsupported_block_errors = [
"transcription is not supported",
"ai_block is not supported",
"is not supported via the API",
]
is_unsupported_block_error = any(
err in error_str.lower() for err in unsupported_block_errors
)
logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
if is_unsupported_block_error:
# Log as warning since this is a known Notion API limitation
logger.warning(
f"Notion API limitation for connector {connector_id}: {error_str}. "
"This is a known issue with Notion AI blocks (transcription, ai_block) "
"that are not accessible via the Notion API."
)
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages: Notion API limitation",
f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
{"error_type": "UnsupportedBlockType", "is_known_limitation": True},
)
else:
# Log as error for other failures
logger.error(
f"Error fetching Notion pages for connector {connector_id}: {error_str}",
exc_info=True,
)
await task_logger.log_task_failure(
log_entry,
f"Failed to get Notion pages for connector {connector_id}",
str(e),
{"error_type": "PageFetchError"},
)
await notion_client.close()
return 0, f"Failed to get Notion pages: {e!s}"

View file

@ -108,10 +108,15 @@ async def index_crawled_urls(
api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config
urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
raw_initial_urls = connector.config.get("INITIAL_URLS")
urls = parse_webcrawler_urls(raw_initial_urls)
# DEBUG: Log connector config details for troubleshooting empty URL issues
logger.info(
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs. "
f"Connector name: {connector.name}, "
f"INITIAL_URLS type: {type(raw_initial_urls).__name__}, "
f"INITIAL_URLS value: {repr(raw_initial_urls)[:200] if raw_initial_urls else 'None'}"
)
# Initialize webcrawler client
@ -128,11 +133,18 @@ async def index_crawled_urls(
# Validate URLs
if not urls:
# DEBUG: Log detailed connector config for troubleshooting
logger.error(
f"No URLs provided for indexing. Connector ID: {connector_id}, "
f"Connector name: {connector.name}, "
f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
)
await task_logger.log_task_failure(
log_entry,
"No URLs provided for indexing",
"Empty URL list",
{"error_type": "ValidationError"},
f"Empty URL list. INITIAL_URLS value: {repr(raw_initial_urls)[:100]}",
{"error_type": "ValidationError", "connector_name": connector.name},
)
return 0, "No URLs provided for indexing"