From 48e646607ba935cc1bae26b15879de5f13938127 Mon Sep 17 00:00:00 2001
From: Manoj Aggarwal
Date: Mon, 2 Feb 2026 12:07:53 -0800
Subject: [PATCH 1/2] Fix Google Calendar and Notion errors

---
 .../app/tasks/celery_tasks/connector_tasks.py | 43 +++++++++++++++++++
 .../app/tasks/connector_indexers/base.py      | 14 ++++++
 .../google_calendar_indexer.py                | 14 ++++++
 .../connector_indexers/notion_indexer.py      | 43 ++++++++++++++++---
 .../connector_indexers/webcrawler_indexer.py  | 20 +++++++-
 5 files changed, 124 insertions(+), 10 deletions(-)

diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
index d0710d246..760651589 100644
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -1,6 +1,7 @@
 """Celery tasks for connector indexing."""
 
 import logging
+import traceback
 
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.pool import NullPool
@@ -11,6 +12,36 @@ from app.config import config
 logger = logging.getLogger(__name__)
 
 
+def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
+    """
+    Handle greenlet_spawn errors with detailed logging for debugging.
+
+    The 'greenlet_spawn has not been called' error occurs when:
+    1. SQLAlchemy lazy-loads a relationship outside of an async context
+    2. A sync operation is called from an async context (or vice versa)
+    3. Session objects are accessed after the session is closed
+
+    This helper logs detailed context to help identify the root cause.
+    """
+    error_str = str(e)
+    if "greenlet_spawn has not been called" in error_str:
+        logger.error(
+            f"GREENLET ERROR in {task_name} for connector {connector_id}: {error_str}\n"
+            f"This error typically occurs when SQLAlchemy tries to lazy-load a relationship "
+            f"outside of an async context. Check for:\n"
+            f"1. Accessing relationship attributes (e.g., document.chunks, connector.search_space) "
+            f"without using selectinload() or joinedload()\n"
+            f"2. Accessing model attributes after the session is closed\n"
+            f"3. Passing ORM objects between different async contexts\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+    else:
+        logger.error(
+            f"Error in {task_name} for connector {connector_id}: {error_str}\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+
+
 def get_celery_session_maker():
     """
     Create a new async session maker for Celery tasks.
@@ -46,6 +77,9 @@ def index_slack_messages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_slack_messages", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -89,6 +123,9 @@ def index_notion_pages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
            )
        )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_notion_pages", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -347,6 +384,9 @@ def index_google_calendar_events_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_google_calendar_events", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -696,6 +736,9 @@ def index_crawled_urls_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_crawled_urls", connector_id)
+        raise
     finally:
         loop.close()
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index b390937f0..311fda996 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -159,6 +159,20 @@ def calculate_date_range(
     )
     end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
 
+    # FIX: Ensure end_date is at least 1 day after start_date to avoid
+    # "start_date must be strictly before end_date" errors when dates are the same
+    # (e.g., when last_indexed_at is today)
+    if start_date_str == end_date_str:
+        logger.info(
+            f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+            "adjusting end date to next day to ensure valid date range"
+        )
+        # Parse end_date and add 1 day
+        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        end_dt = end_dt + timedelta(days=1)
+        end_date_str = end_dt.strftime("%Y-%m-%d")
+        logger.info(f"Adjusted end date to {end_date_str}")
+
     return start_date_str, end_date_str
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index f64a7a5c3..1d8ea32f2 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -217,6 +217,20 @@ async def index_google_calendar_events(
         start_date_str = start_date
         end_date_str = end_date
 
+    # FIX: Ensure end_date is at least 1 day after start_date to avoid
+    # "start_date must be strictly before end_date" errors when dates are the same
+    # (e.g., when last_indexed_at is today)
+    if start_date_str == end_date_str:
+        logger.info(
+            f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+            "adjusting end date to next day to ensure valid date range"
+        )
+        # Parse end_date and add 1 day
+        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        end_dt = end_dt + timedelta(days=1)
+        end_date_str = end_dt.strftime("%Y-%m-%d")
+        logger.info(f"Adjusted end date to {end_date_str}")
+
     await task_logger.log_task_progress(
         log_entry,
         f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
index 52622471a..ee5bca5d8 100644
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@@ -196,13 +196,44 @@ async def index_notion_pages(
                 "Recommend reconnecting with OAuth."
             )
         except Exception as e:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Failed to get Notion pages for connector {connector_id}",
-                str(e),
-                {"error_type": "PageFetchError"},
+            error_str = str(e)
+            # Check if this is an unsupported block type error (transcription, ai_block, etc.)
+            # These are known Notion API limitations and should be logged as warnings, not errors
+            unsupported_block_errors = [
+                "transcription is not supported",
+                "ai_block is not supported",
+                "is not supported via the API",
+            ]
+            is_unsupported_block_error = any(
+                err in error_str.lower() for err in unsupported_block_errors
             )
-            logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
+
+            if is_unsupported_block_error:
+                # Log as warning since this is a known Notion API limitation
+                logger.warning(
+                    f"Notion API limitation for connector {connector_id}: {error_str}. "
+                    "This is a known issue with Notion AI blocks (transcription, ai_block) "
+                    "that are not accessible via the Notion API."
+                )
+                await task_logger.log_task_failure(
+                    log_entry,
+                    "Failed to get Notion pages: Notion API limitation",
+                    f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
+                    {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
+                )
+            else:
+                # Log as error for other failures
+                logger.error(
+                    f"Error fetching Notion pages for connector {connector_id}: {error_str}",
+                    exc_info=True,
+                )
+                await task_logger.log_task_failure(
+                    log_entry,
+                    f"Failed to get Notion pages for connector {connector_id}",
+                    str(e),
+                    {"error_type": "PageFetchError"},
+                )
+
         await notion_client.close()
         return 0, f"Failed to get Notion pages: {e!s}"
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
index ac16ecde6..b3c24a4e3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -108,10 +108,15 @@ async def index_crawled_urls(
     api_key = connector.config.get("FIRECRAWL_API_KEY")
 
     # Get URLs from connector config
-    urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
+    raw_initial_urls = connector.config.get("INITIAL_URLS")
+    urls = parse_webcrawler_urls(raw_initial_urls)
 
+    # DEBUG: Log connector config details for troubleshooting empty URL issues
     logger.info(
-        f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
+        f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs. "
+        f"Connector name: {connector.name}, "
+        f"INITIAL_URLS type: {type(raw_initial_urls).__name__}, "
+        f"INITIAL_URLS value: {repr(raw_initial_urls)[:200] if raw_initial_urls else 'None'}"
     )
 
     # Initialize webcrawler client
@@ -128,11 +133,18 @@ async def index_crawled_urls(
 
     # Validate URLs
     if not urls:
+        # DEBUG: Log detailed connector config for troubleshooting
+        logger.error(
+            f"No URLs provided for indexing. Connector ID: {connector_id}, "
+            f"Connector name: {connector.name}, "
+            f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
+            f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
+        )
         await task_logger.log_task_failure(
             log_entry,
             "No URLs provided for indexing",
-            "Empty URL list",
-            {"error_type": "ValidationError"},
+            f"Empty URL list. INITIAL_URLS value: {repr(raw_initial_urls)[:100]}",
+            {"error_type": "ValidationError", "connector_name": connector.name},
         )
         return 0, "No URLs provided for indexing"
 

From 33165830e58820436bb27d915d9e81981ec2f7f0 Mon Sep 17 00:00:00 2001
From: Manoj Aggarwal
Date: Wed, 4 Feb 2026 13:18:33 -0800
Subject: [PATCH 2/2] Add parse_date_flexible date parsing helper

---
 .../app/tasks/connector_indexers/base.py      | 36 ++++++++++++++++++-
 .../google_calendar_indexer.py                |  9 ++++-
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index 311fda996..b801b67d6 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -28,6 +28,34 @@ def get_current_timestamp() -> datetime:
     return datetime.now(UTC)
 
 
+def parse_date_flexible(date_str: str) -> datetime:
+    """
+    Parse date from multiple common formats.
+
+    Args:
+        date_str: Date string to parse
+
+    Returns:
+        Parsed datetime object
+
+    Raises:
+        ValueError: If unable to parse the date string
+    """
+    formats = ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_str.rstrip("Z"), fmt)
+        except ValueError:
+            continue
+
+    # Try ISO format as fallback
+    try:
+        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+    except ValueError:
+        raise ValueError(f"Unable to parse date: {date_str}")
+
+
 async def check_duplicate_document_by_hash(
     session: AsyncSession, content_hash: str
 ) -> Document | None:
@@ -168,7 +196,13 @@ def calculate_date_range(
             "adjusting end date to next day to ensure valid date range"
         )
         # Parse end_date and add 1 day
-        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        try:
+            end_dt = parse_date_flexible(end_date_str)
+        except ValueError:
+            logger.warning(
+                f"Could not parse end_date '{end_date_str}', using current date"
+            )
+            end_dt = datetime.now()
         end_dt = end_dt + timedelta(days=1)
         end_date_str = end_dt.strftime("%Y-%m-%d")
         logger.info(f"Adjusted end date to {end_date_str}")
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 1d8ea32f2..3e1a81356 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -27,6 +27,7 @@ from .base import (
     get_connector_by_id,
     get_current_timestamp,
     logger,
+    parse_date_flexible,
     update_connector_last_indexed,
 )
 
@@ -226,7 +227,13 @@ async def index_google_calendar_events(
             "adjusting end date to next day to ensure valid date range"
         )
         # Parse end_date and add 1 day
-        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        try:
+            end_dt = parse_date_flexible(end_date_str)
+        except ValueError:
+            logger.warning(
+                f"Could not parse end_date '{end_date_str}', using current date"
+            )
+            end_dt = datetime.now()
         end_dt = end_dt + timedelta(days=1)
         end_date_str = end_dt.strftime("%Y-%m-%d")
         logger.info(f"Adjusted end date to {end_date_str}")
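
For context on the failure mode _handle_greenlet_error diagnoses, here is a minimal sketch of the eager-loading pattern its log message recommends. The Connector model and its search_space relationship are assumptions standing in for the app's real models (not taken from this diff); select, selectinload, and the awaited session.execute are standard SQLAlchemy 2.x async API.

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload

async def load_connector_eagerly(session: AsyncSession, connector_id: int):
    # Connector is assumed to be the app's ORM model (not shown in this diff).
    # Eager-loading while the async session is active means a later attribute
    # access (connector.search_space) never triggers a lazy load; a lazy load
    # outside the async context is exactly what raises MissingGreenlet
    # ("greenlet_spawn has not been called").
    result = await session.execute(
        select(Connector)
        .options(selectinload(Connector.search_space))
        .where(Connector.id == connector_id)
    )
    return result.scalar_one_or_none()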
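
The same-day adjustment that calculate_date_range and index_google_calendar_events both apply can be exercised in isolation. bump_end_date is a hypothetical name used only for this demo; the strptime/timedelta arithmetic mirrors the hunks above.

from datetime import datetime, timedelta

def bump_end_date(start_date_str: str, end_date_str: str) -> str:
    # When last_indexed_at falls on the current day, the calculated start and
    # end dates collapse to the same value, and connectors that require
    # start_date strictly before end_date reject the range. Pushing the end
    # date out by one day keeps the range valid.
    if start_date_str == end_date_str:
        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d") + timedelta(days=1)
        return end_dt.strftime("%Y-%m-%d")
    return end_date_str

assert bump_end_date("2026-02-02", "2026-02-02") == "2026-02-03"
assert bump_end_date("2026-02-01", "2026-02-02") == "2026-02-02"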
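
As written, parse_date_flexible accepts plain dates, T-separated timestamps with an optional trailing Z, and anything fromisoformat handles after Z is rewritten to +00:00 (the rewrite matters on Python < 3.11, where fromisoformat rejects a literal Z suffix). A quick sanity check, assuming the module imports as the diff paths suggest:

from datetime import datetime, timezone

from app.tasks.connector_indexers.base import parse_date_flexible

# Explicit formats are tried first and yield naive datetimes:
assert parse_date_flexible("2026-02-04") == datetime(2026, 2, 4)
assert parse_date_flexible("2026-02-04T13:18:33") == datetime(2026, 2, 4, 13, 18, 33)
# A trailing "Z" is stripped before strptime, so this is also naive:
assert parse_date_flexible("2026-02-04T13:18:33Z") == datetime(2026, 2, 4, 13, 18, 33)
# Offset-aware strings fall through to the fromisoformat branch:
assert parse_date_flexible("2026-02-04T13:18:33+00:00") == datetime(
    2026, 2, 4, 13, 18, 33, tzinfo=timezone.utc
)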