Mirror of https://github.com/MODSetter/SurfSense.git, synced 2026-04-26 09:16:22 +02:00.
Merge pull request #785 from manojag115/bugs_prod

Fixing google calendar and notion errors + better logging for debugging

Commit 074e11be2c: 5 changed files with 165 additions and 10 deletions. (File paths are not preserved in this mirror view; the hunks below are grouped by file.)
Celery connector-indexing tasks:

```diff
@@ -1,6 +1,7 @@
 """Celery tasks for connector indexing."""
 
 import logging
+import traceback
 
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.pool import NullPool
@@ -11,6 +12,36 @@ from app.config import config
 logger = logging.getLogger(__name__)
 
 
+def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
+    """
+    Handle greenlet_spawn errors with detailed logging for debugging.
+
+    The 'greenlet_spawn has not been called' error occurs when:
+    1. SQLAlchemy lazy-loads a relationship outside of an async context
+    2. A sync operation is called from an async context (or vice versa)
+    3. Session objects are accessed after the session is closed
+
+    This helper logs detailed context to help identify the root cause.
+    """
+    error_str = str(e)
+    if "greenlet_spawn has not been called" in error_str:
+        logger.error(
+            f"GREENLET ERROR in {task_name} for connector {connector_id}: {error_str}\n"
+            f"This error typically occurs when SQLAlchemy tries to lazy-load a relationship "
+            f"outside of an async context. Check for:\n"
+            f"1. Accessing relationship attributes (e.g., document.chunks, connector.search_space) "
+            f"without using selectinload() or joinedload()\n"
+            f"2. Accessing model attributes after the session is closed\n"
+            f"3. Passing ORM objects between different async contexts\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+    else:
+        logger.error(
+            f"Error in {task_name} for connector {connector_id}: {error_str}\n"
+            f"Stack trace:\n{traceback.format_exc()}"
+        )
+
+
 def get_celery_session_maker():
     """
     Create a new async session maker for Celery tasks.
```
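The docstring above describes the failure mode in prose; here is a minimal self-contained sketch of the pitfall and the eager-loading fix it recommends. This is not from the commit, and the models below are stand-ins for the repo's real ones:

```python
from sqlalchemy import ForeignKey, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    mapped_column,
    relationship,
    selectinload,
)


class Base(DeclarativeBase):
    pass


class SearchSpace(Base):
    __tablename__ = "search_spaces"  # table names are stand-ins
    id: Mapped[int] = mapped_column(primary_key=True)


class Connector(Base):
    __tablename__ = "connectors"
    id: Mapped[int] = mapped_column(primary_key=True)
    search_space_id: Mapped[int] = mapped_column(ForeignKey("search_spaces.id"))
    search_space: Mapped[SearchSpace] = relationship()


async def fetch_connector(session: AsyncSession, connector_id: int) -> Connector:
    result = await session.execute(
        select(Connector)
        # Eager-load the relationship while the async session is still open.
        # Without this, touching connector.search_space after the session
        # closes triggers a lazy load and raises
        # "greenlet_spawn has not been called".
        .options(selectinload(Connector.search_space))
        .where(Connector.id == connector_id)
    )
    return result.scalar_one()
```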
Each indexing task wrapper gains the same except clause:

```diff
@@ -46,6 +77,9 @@ def index_slack_messages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_slack_messages", connector_id)
+        raise
     finally:
         loop.close()
```
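For context, the task wrappers all share one shape: run the async indexer on a dedicated event loop, report greenlet errors, close the loop. A sketch of that assumed pattern follows; the decorator arguments are illustrative, and index_slack_messages / _handle_greenlet_error are imported in the real module:

```python
import asyncio

from celery import shared_task


@shared_task(name="index_slack_messages")  # registration details assumed
def index_slack_messages_task(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(
            index_slack_messages(  # the async indexer for this connector
                connector_id, search_space_id, user_id, start_date, end_date
            )
        )
    except Exception as e:
        # New in this commit: log greenlet-specific context, then re-raise so
        # Celery still records the task as failed.
        _handle_greenlet_error(e, "index_slack_messages", connector_id)
        raise
    finally:
        loop.close()
```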
The same three-line handler is added to the Notion, Google Calendar, and crawled-URL task wrappers:

```diff
@@ -89,6 +123,9 @@ def index_notion_pages_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_notion_pages", connector_id)
+        raise
     finally:
         loop.close()
@@ -347,6 +384,9 @@ def index_google_calendar_events_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_google_calendar_events", connector_id)
+        raise
     finally:
         loop.close()
@@ -696,6 +736,9 @@ def index_crawled_urls_task(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
+    except Exception as e:
+        _handle_greenlet_error(e, "index_crawled_urls", connector_id)
+        raise
     finally:
         loop.close()
```
Shared indexer helpers: a tolerant date parser is added next to get_current_timestamp():

```diff
@@ -28,6 +28,34 @@ def get_current_timestamp() -> datetime:
     return datetime.now(UTC)
 
 
+def parse_date_flexible(date_str: str) -> datetime:
+    """
+    Parse date from multiple common formats.
+
+    Args:
+        date_str: Date string to parse
+
+    Returns:
+        Parsed datetime object
+
+    Raises:
+        ValueError: If unable to parse the date string
+    """
+    formats = ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_str.rstrip("Z"), fmt)
+        except ValueError:
+            continue
+
+    # Try ISO format as fallback
+    try:
+        return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+    except ValueError:
+        raise ValueError(f"Unable to parse date: {date_str}")
+
+
 async def check_duplicate_document_by_hash(
     session: AsyncSession, content_hash: str
 ) -> Document | None:
```
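A quick usage sketch of the new helper, following its logic as shown in the hunk above:

```python
from datetime import datetime

# Plain date and datetime formats hit the strptime fast path:
assert parse_date_flexible("2026-01-31") == datetime(2026, 1, 31)
assert parse_date_flexible("2026-01-31T08:30:00") == datetime(2026, 1, 31, 8, 30)
# A trailing "Z" is stripped before strptime, so UTC timestamps also parse:
assert parse_date_flexible("2026-01-31T08:30:00Z") == datetime(2026, 1, 31, 8, 30)
# Anything else falls through to fromisoformat(), then raises ValueError:
try:
    parse_date_flexible("31/01/2026")
except ValueError as exc:
    print(exc)  # Unable to parse date: 31/01/2026
```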
In the same helpers module, calculate_date_range() now widens same-day ranges:

```diff
@@ -159,6 +187,26 @@ def calculate_date_range(
         )
     end_date_str = end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
 
+    # FIX: Ensure end_date is at least 1 day after start_date to avoid
+    # "start_date must be strictly before end_date" errors when dates are the same
+    # (e.g., when last_indexed_at is today)
+    if start_date_str == end_date_str:
+        logger.info(
+            f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+            "adjusting end date to next day to ensure valid date range"
+        )
+        # Parse end_date and add 1 day
+        try:
+            end_dt = parse_date_flexible(end_date_str)
+        except ValueError:
+            logger.warning(
+                f"Could not parse end_date '{end_date_str}', using current date"
+            )
+            end_dt = datetime.now()
+        end_dt = end_dt + timedelta(days=1)
+        end_date_str = end_dt.strftime("%Y-%m-%d")
+        logger.info(f"Adjusted end date to {end_date_str}")
+
     return start_date_str, end_date_str
```
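Stripped of logging, the adjustment reduces to a few lines. A standalone sketch (widen_if_same_day is a name invented here, not from the repo):

```python
from datetime import datetime, timedelta


def widen_if_same_day(start_date_str: str, end_date_str: str) -> tuple[str, str]:
    """Push end_date one day forward when the range would be empty."""
    if start_date_str == end_date_str:
        end_dt = datetime.strptime(end_date_str, "%Y-%m-%d") + timedelta(days=1)
        end_date_str = end_dt.strftime("%Y-%m-%d")
    return start_date_str, end_date_str


# A connector last indexed earlier today would otherwise produce an
# equal-endpoint range that the connector APIs reject outright:
print(widen_if_same_day("2026-04-25", "2026-04-25"))  # ('2026-04-25', '2026-04-26')
```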
Google Calendar indexer: it imports the new helper and applies the same same-day adjustment before fetching events:

```diff
@@ -27,6 +27,7 @@ from .base import (
     get_connector_by_id,
     get_current_timestamp,
     logger,
+    parse_date_flexible,
     update_connector_last_indexed,
 )
@@ -217,6 +218,26 @@ async def index_google_calendar_events(
             start_date_str = start_date
             end_date_str = end_date
 
+        # FIX: Ensure end_date is at least 1 day after start_date to avoid
+        # "start_date must be strictly before end_date" errors when dates are the same
+        # (e.g., when last_indexed_at is today)
+        if start_date_str == end_date_str:
+            logger.info(
+                f"Start date ({start_date_str}) equals end date ({end_date_str}), "
+                "adjusting end date to next day to ensure valid date range"
+            )
+            # Parse end_date and add 1 day
+            try:
+                end_dt = parse_date_flexible(end_date_str)
+            except ValueError:
+                logger.warning(
+                    f"Could not parse end_date '{end_date_str}', using current date"
+                )
+                end_dt = datetime.now()
+            end_dt = end_dt + timedelta(days=1)
+            end_date_str = end_dt.strftime("%Y-%m-%d")
+            logger.info(f"Adjusted end date to {end_date_str}")
+
         await task_logger.log_task_progress(
             log_entry,
             f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
```
Notion indexer: the page-fetch error handler now distinguishes known Notion API limitations from real failures:

```diff
@@ -196,13 +196,44 @@ async def index_notion_pages(
                 "Recommend reconnecting with OAuth."
             )
     except Exception as e:
-        await task_logger.log_task_failure(
-            log_entry,
-            f"Failed to get Notion pages for connector {connector_id}",
-            str(e),
-            {"error_type": "PageFetchError"},
-        )
-        logger.error(f"Error fetching Notion pages: {e!s}", exc_info=True)
+        error_str = str(e)
+        # Check if this is an unsupported block type error (transcription, ai_block, etc.)
+        # These are known Notion API limitations and should be logged as warnings, not errors
+        unsupported_block_errors = [
+            "transcription is not supported",
+            "ai_block is not supported",
+            "is not supported via the API",
+        ]
+        is_unsupported_block_error = any(
+            err in error_str.lower() for err in unsupported_block_errors
+        )
+
+        if is_unsupported_block_error:
+            # Log as warning since this is a known Notion API limitation
+            logger.warning(
+                f"Notion API limitation for connector {connector_id}: {error_str}. "
+                "This is a known issue with Notion AI blocks (transcription, ai_block) "
+                "that are not accessible via the Notion API."
+            )
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to get Notion pages: Notion API limitation",
+                f"{error_str} - This page contains Notion AI content (transcription/ai_block) that cannot be accessed via the API.",
+                {"error_type": "UnsupportedBlockType", "is_known_limitation": True},
+            )
+        else:
+            # Log as error for other failures
+            logger.error(
+                f"Error fetching Notion pages for connector {connector_id}: {error_str}",
+                exc_info=True,
+            )
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to get Notion pages for connector {connector_id}",
+                str(e),
+                {"error_type": "PageFetchError"},
+            )
 
         await notion_client.close()
         return 0, f"Failed to get Notion pages: {e!s}"
```
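The classification itself is a substring match against the lowercased error text. A tiny self-contained sketch of that check; note the markers here are written lowercase so they can actually match the lowercased string (the diff above keeps "via the API" with capitals while comparing against error_str.lower()):

```python
# Markers kept lowercase to match the lowercased error text.
UNSUPPORTED_BLOCK_MARKERS = [
    "transcription is not supported",
    "ai_block is not supported",
    "is not supported via the api",
]


def is_known_notion_limitation(error: Exception) -> bool:
    error_str = str(error).lower()
    return any(marker in error_str for marker in UNSUPPORTED_BLOCK_MARKERS)


print(is_known_notion_limitation(
    ValueError("Block type ai_block is not supported via the API.")
))  # True -> logged as a warning, flagged is_known_limitation
print(is_known_notion_limitation(TimeoutError("request timed out")))  # False -> error
```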
Web crawler indexer: the raw INITIAL_URLS config value is captured and logged to debug empty-URL failures:

```diff
@@ -108,10 +108,15 @@ async def index_crawled_urls(
         api_key = connector.config.get("FIRECRAWL_API_KEY")
 
         # Get URLs from connector config
-        urls = parse_webcrawler_urls(connector.config.get("INITIAL_URLS"))
+        raw_initial_urls = connector.config.get("INITIAL_URLS")
+        urls = parse_webcrawler_urls(raw_initial_urls)
 
+        # DEBUG: Log connector config details for troubleshooting empty URL issues
         logger.info(
-            f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
+            f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs. "
+            f"Connector name: {connector.name}, "
+            f"INITIAL_URLS type: {type(raw_initial_urls).__name__}, "
+            f"INITIAL_URLS value: {repr(raw_initial_urls)[:200] if raw_initial_urls else 'None'}"
         )
 
         # Initialize webcrawler client
@@ -128,11 +133,18 @@ async def index_crawled_urls(
 
         # Validate URLs
         if not urls:
+            # DEBUG: Log detailed connector config for troubleshooting
+            logger.error(
+                f"No URLs provided for indexing. Connector ID: {connector_id}, "
+                f"Connector name: {connector.name}, "
+                f"Config keys: {list(connector.config.keys()) if connector.config else 'None'}, "
+                f"INITIAL_URLS raw value: {repr(raw_initial_urls)}"
+            )
             await task_logger.log_task_failure(
                 log_entry,
                 "No URLs provided for indexing",
-                "Empty URL list",
-                {"error_type": "ValidationError"},
+                f"Empty URL list. INITIAL_URLS value: {repr(raw_initial_urls)[:100]}",
+                {"error_type": "ValidationError", "connector_name": connector.name},
            )
             return 0, "No URLs provided for indexing"
 
```
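Logging the type and repr of INITIAL_URLS matters because the config field can plausibly arrive in several shapes. A defensive sketch of a parser that motivates it; this is an assumption about the shapes involved, and the repo's actual parse_webcrawler_urls may differ:

```python
import json


def parse_webcrawler_urls_sketch(raw) -> list[str]:
    """Accept a list, a JSON-encoded list, or a comma-separated string."""
    if raw is None:
        return []
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)  # e.g. '["https://example.com"]'
        except json.JSONDecodeError:
            raw = [part.strip() for part in raw.split(",") if part.strip()]
    if not isinstance(raw, list):
        return []
    return [
        u for u in raw
        if isinstance(u, str) and u.startswith(("http://", "https://"))
    ]


print(parse_webcrawler_urls_sketch('["https://example.com", "ftp://skip.me"]'))
# ['https://example.com']
print(parse_webcrawler_urls_sketch(None))  # [] -> hits the new detailed error log
```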