From 48e646607ba935cc1bae26b15879de5f13938127 Mon Sep 17 00:00:00 2001
From: Manoj Aggarwal
Date: Mon, 2 Feb 2026 12:07:53 -0800
Subject: [PATCH 1/2] Fix Google Calendar and Notion errors

---
 .../app/tasks/celery_tasks/connector_tasks.py | 43 +++++++++++++++++++
 .../app/tasks/connector_indexers/base.py      | 14 ++++++
 .../google_calendar_indexer.py                | 14 ++++++
 .../connector_indexers/notion_indexer.py      | 43 ++++++++++++++++---
 .../connector_indexers/webcrawler_indexer.py  | 20 +++++++-
 5 files changed, 124 insertions(+), 10 deletions(-)

diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
index d0710d246..760651589 100644
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -1,6 +1,7 @@
 """Celery tasks for connector indexing."""
 
 import logging
+import traceback
 
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.pool import NullPool
@@ -11,6 +12,36 @@ from app.config import config
 logger = logging.getLogger(__name__)
 
 
+def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
+    """
+    Handle greenlet_spawn errors with detailed logging for debugging.
+
+    The 'greenlet_spawn has not been called' error occurs when:
+    1. SQLAlchemy lazy-loads a relationship outside of an async context
+    2. A sync operation is called from an async context (or vice versa)
+    3. Session objects are accessed after the session is closed
+
+    This helper logs detailed context to help identify the root cause.
+    """
+    error_str = str(e)
+    if "greenlet_spawn has not been called" in error_str:
+        logger.error(
+            f"GREENLET ERROR in {task_name} for connector {connector_id}: {error_str}\n"
+            f"This error typically occurs when SQLAlchemy tries to lazy-load a relationship "
+            f"outside of an async context. Check for:\n"
+            f"1. Accessing relationship attributes (e.g., document.chunks, connector.search_space) "
+            f"without using selectinload() or joinedload()\n"
+            f"2. Accessing model attributes after the session is closed\n"
+            f"3. Passing ORM objects between different async contexts\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+    else:
+        logger.error(
+            f"Error in {task_name} for connector {connector_id}: {error_str}\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+
+
 def get_celery_session_maker():
     """
     Create a new async session maker for Celery tasks.
@@ -46,6 +77,9 @@ def index_slack_messages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_slack_messages", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -89,6 +123,9 @@ def index_notion_pages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
            )
        )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_notion_pages", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -347,6 +384,9 @@ def index_google_calendar_events_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_google_calendar_events", connector_id)
+        raise
     finally:
         loop.close()
 
@@ -696,6 +736,9 @@ def index_crawled_urls_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_crawled_urls", connector_id)
+        raise
     finally:
         loop.close()
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index b390937f0..311fda996 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -159,6 +159,20 @@ def calculate_date_range(
     )
     end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
 
+    # FIX: Ensure end_date is at least 1 day after start_date to avoid
+    # "start_date must be strictly before end_date" errors when dates are the same
+    # (e.g., when last_indexed_at is today)
+    if start_date_str == end_date_str:
+        logger.info(
+            f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+            "adjusting end date to next day to ensure valid date range"
+        )
+        # Parse end_date and add 1 day
+        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        end_dt = end_dt + timedelta(days=1)
+        end_date_str = end_dt.strftime("%Y-%m-%d")
+        logger.info(f"Adjusted end date to {end_date_str}")
+
     return start_date_str, end_date_str
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index f64a7a5c3..1d8ea32f2 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -217,6 +217,20 @@ async def index_google_calendar_events(
         start_date_str = start_date
         end_date_str = end_date
 
+    # FIX: Ensure end_date is at least 1 day after start_date to avoid
+    # "start_date must be strictly before end_date" errors when dates are the same
+    # (e.g., when last_indexed_at is today)
+    if start_date_str == end_date_str:
+        logger.info(
+            f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+            "adjusting end date to next day to ensure valid date range"
+        )
+        # Parse end_date and add 1 day
+        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        end_dt = end_dt + timedelta(days=1)
+        end_date_str = end_dt.strftime("%Y-%m-%d")
+        logger.info(f"Adjusted end date to {end_date_str}")
+
     await task_logger.log_task_progress(
         log_entry,
         f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
index 52622471a..ee5bca5d8 100644
--- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py
@@ -196,13 +196,44 @@ async def index_notion_pages(
                 "Recommend reconnecting with OAuth."
             )
         except Exception as e:
-            await task_logger.log_task_failure(
-                log_entry,
-                f"Failed to get Notion pages for connector {connector_id}",
-                str(e),
-                {"error_type": "PageFetchError"},
+            error_str = str(e)
+            # Check if this is an unsupported block type error (transcription, ai_block, etc.)
+            # These are known Notion API limitations and should be logged as warnings, not errors
+            unsupported_block_errors = [
+                "transcription is not supported",
+                "ai_block is not supported",
+                "is not supported via the API",
+            ]
+            is_unsupported_block_error = any(
+                err in error_str.lower() for err in unsupported_block_errors
             )
-            logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
+
+            if is_unsupported_block_error:
+                # Log as warning since this is a known Notion API limitation
+                logger.warning(
+                    f"Notion API limitation for connector {connector_id}: {error_str}. "
+                    "This is a known issue with Notion AI blocks (transcription, ai_block) "
+                    "that are not accessible via the Notion API."
+                )
+                await task_logger.log_task_failure(
+                    log_entry,
+                    "Failed to get Notion pages: Notion API limitation",
+                    f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
+                    {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
+                )
+            else:
+                # Log as error for other failures
+                logger.error(
+                    f"Error fetching Notion pages for connector {connector_id}: {error_str}",
+                    exc_info=True,
+                )
+                await task_logger.log_task_failure(
+                    log_entry,
+                    f"Failed to get Notion pages for connector {connector_id}",
+                    str(e),
+                    {"error_type": "PageFetchError"},
+                )
+
         await notion_client.close()
         return 0, f"Failed to get Notion pages: {e!s}"
 
diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
index ac16ecde6..b3c24a4e3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -108,10 +108,15 @@ async def index_crawled_urls(
     api_key = connector.config.get("FIRECRAWL_API_KEY")
 
     # Get URLs from connector config
-    urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
+    raw_initial_urls = connector.config.get("INITIAL_URLS")
+    urls = parse_webcrawler_urls(raw_initial_urls)
 
+    # DEBUG: Log connector config details for troubleshooting empty URL issues
     logger.info(
-        f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
+        f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs. "
+        f"Connector name: {connector.name}, "
+        f"INITIAL_URLS type: {type(raw_initial_urls).__name__}, "
+        f"INITIAL_URLS value: {repr(raw_initial_urls)[:200] if raw_initial_urls else 'None'}"
     )
 
     # Initialize webcrawler client
@@ -128,11 +133,18 @@ async def index_crawled_urls(
 
     # Validate URLs
     if not urls:
+        # DEBUG: Log detailed connector config for troubleshooting
+        logger.error(
+            f"No URLs provided for indexing. Connector ID: {connector_id}, "
+            f"Connector name: {connector.name}, "
+            f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
+            f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
+        )
         await task_logger.log_task_failure(
             log_entry,
             "No URLs provided for indexing",
-            "Empty URL list",
-            {"error_type": "ValidationError"},
+            f"Empty URL list. INITIAL_URLS value: {repr(raw_initial_urls)[:100]}",
+            {"error_type": "ValidationError", "connector_name": connector.name},
         )
         return 0, "No URLs provided for indexing"
 

From 33165830e58820436bb27d915d9e81981ec2f7f0 Mon Sep 17 00:00:00 2001
From: Manoj Aggarwal
Date: Wed, 4 Feb 2026 13:18:33 -0800
Subject: [PATCH 2/2] Add parse_date_flexible date parsing helper

---
 .../app/tasks/connector_indexers/base.py      | 36 ++++++++++++++++++-
 .../google_calendar_indexer.py                |  9 ++++-
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py
index 311fda996..b801b67d6 100644
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@@ -28,6 +28,34 @@ def get_current_timestamp() -> datetime:
     return datetime.now(UTC)
 
 
+def parse_date_flexible(date_str: str) -> datetime:
+    """
+    Parse date from multiple common formats.
+
+    Args:
+        date_str: Date string to parse
+
+    Returns:
+        Parsed datetime object
+
+    Raises:
+        ValueError: If unable to parse the date string
+    """
+    formats = ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_str.rstrip("Z"), fmt)
+        except ValueError:
+            continue
+
+    # Try ISO format as fallback
+    try:
+        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+    except ValueError:
+        raise ValueError(f"Unable to parse date: {date_str}")
+
+
 async def check_duplicate_document_by_hash(
     session: AsyncSession, content_hash: str
 ) -> Document | None:
@@ -168,7 +196,13 @@ def calculate_date_range(
             "adjusting end date to next day to ensure valid date range"
         )
         # Parse end_date and add 1 day
-        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        try:
+            end_dt = parse_date_flexible(end_date_str)
+        except ValueError:
+            logger.warning(
+                f"Could not parse end_date '{end_date_str}', using current date"
+            )
+            end_dt = datetime.now()
         end_dt = end_dt + timedelta(days=1)
         end_date_str = end_dt.strftime("%Y-%m-%d")
         logger.info(f"Adjusted end date to {end_date_str}")
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
index 1d8ea32f2..3e1a81356 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@@ -27,6 +27,7 @@ from .base import (
     get_connector_by_id,
     get_current_timestamp,
     logger,
+    parse_date_flexible,
     update_connector_last_indexed,
 )
 
@@ -226,7 +227,13 @@ async def index_google_calendar_events(
             "adjusting end date to next day to ensure valid date range"
         )
         # Parse end_date and add 1 day
-        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d")
+        try:
+            end_dt = parse_date_flexible(end_date_str)
+        except ValueError:
+            logger.warning(
+                f"Could not parse end_date '{end_date_str}', using current date"
+            )
+            end_dt = datetime.now()
         end_dt = end_dt + timedelta(days=1)
         end_date_str = end_dt.strftime("%Y-%m-%d")
         logger.info(f"Adjusted end date to {end_date_str}")
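
For context on the failure mode _handle_greenlet_error diagnoses, here is a minimal sketch of the eager-loading pattern its log message recommends. The Connector model and its search_space relationship are assumptions standing in for the app's real models (not taken from this diff); select, selectinload, and the awaited session.execute are standard SQLAlchemy 2.x async API.

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload

async def load_connector_eagerly(session: AsyncSession, connector_id: int):
    # Connector is assumed to be the app's ORM model (not shown in this diff).
    # Eager-loading while the async session is active means a later attribute
    # access (connector.search_space) never triggers a lazy load; a lazy load
    # outside the async context is exactly what raises MissingGreenlet
    # ("greenlet_spawn has not been called").
    result = await session.execute(
        select(Connector)
        .options(selectinload(Connector.search_space))
        .where(Connector.id == connector_id)
    )
    return result.scalar_one_or_none()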
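
The same-day adjustment that calculate_date_range and index_google_calendar_events both apply can be exercised in isolation. bump_end_date is a hypothetical name used only for this demo; the strptime/timedelta arithmetic mirrors the hunks above.

from datetime import datetime, timedelta

def bump_end_date(start_date_str: str, end_date_str: str) -> str:
    # When last_indexed_at falls on the current day, the calculated start and
    # end dates collapse to the same value, and connectors that require
    # start_date strictly before end_date reject the range. Pushing the end
    # date out by one day keeps the range valid.
    if start_date_str == end_date_str:
        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d") + timedelta(days=1)
        return end_dt.strftime("%Y-%m-%d")
    return end_date_str

assert bump_end_date("2026-02-02", "2026-02-02") == "2026-02-03"
assert bump_end_date("2026-02-01", "2026-02-02") == "2026-02-02"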
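
As written, parse_date_flexible accepts plain dates, T-separated timestamps with an optional trailing Z, and anything fromisoformat handles after Z is rewritten to +00:00 (the rewrite matters on Python < 3.11, where fromisoformat rejects a literal Z suffix). A quick sanity check, assuming the module imports as the diff paths suggest:

from datetime import datetime, timezone

from app.tasks.connector_indexers.base import parse_date_flexible

# Explicit formats are tried first and yield naive datetimes:
assert parse_date_flexible("2026-02-04") == datetime(2026, 2, 4)
assert parse_date_flexible("2026-02-04T13:18:33") == datetime(2026, 2, 4, 13, 18, 33)
# A trailing "Z" is stripped before strptime, so this is also naive:
assert parse_date_flexible("2026-02-04T13:18:33Z") == datetime(2026, 2, 4, 13, 18, 33)
# Offset-aware strings fall through to the fromisoformat branch:
assert parse_date_flexible("2026-02-04T13:18:33+00:00") == datetime(
    2026, 2, 4, 13, 18, 33, tzinfo=timezone.utc
)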