feat: enhance date handling and indexing logic across connectors

- Added normalization for "undefined" strings to None in date parameters to prevent parsing errors.
- Improved date range validation to ensure start_date is strictly before end_date, adjusting end_date if necessary.
- Updated Google Calendar and Composio connector indexing logic to handle duplicate content more effectively, logging warnings for skipped events.
- Enhanced error handling during final commits to manage integrity errors gracefully.
- Refactored date handling in various connector indexers for consistency and reliability.
This commit is contained in:
Anish Sarkar 2026-01-23 23:03:29 +05:30
parent 08f16b43d7
commit d20bb385b5
9 changed files with 83 additions and 13 deletions

View file

@ -285,6 +285,13 @@ class GoogleGmailConnector:
try:
from datetime import datetime, timedelta
# Normalize date values - handle "undefined" strings from frontend
# This prevents "time data 'undefined' does not match format" errors
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Build date query
query_parts = []

View file

@ -644,20 +644,30 @@ async def index_connector_content(
# Handle different connector types
response_message = ""
today_str = datetime.now().strftime("%Y-%m-%d")
# Use UTC for consistency with last_indexed_at storage
today_str = datetime.now(UTC).strftime("%Y-%m-%d")
# Determine the actual date range to use
if start_date is None:
# Use last_indexed_at or default to 365 days ago
if connector.last_indexed_at:
today = datetime.now().date()
if connector.last_indexed_at.date() == today:
# Convert last_indexed_at to timezone-naive for comparison (like calculate_date_range does)
last_indexed_naive = (
connector.last_indexed_at.replace(tzinfo=None)
if connector.last_indexed_at.tzinfo
else connector.last_indexed_at
)
# Use UTC for "today" to match how last_indexed_at is stored
today_utc = datetime.now(UTC).replace(tzinfo=None).date()
last_indexed_date = last_indexed_naive.date()
if last_indexed_date == today_utc:
# If last indexed today, go back 1 day to ensure we don't miss anything
indexing_from = (today - timedelta(days=1)).strftime("%Y-%m-%d")
indexing_from = (today_utc - timedelta(days=1)).strftime("%Y-%m-%d")
else:
indexing_from = connector.last_indexed_at.strftime("%Y-%m-%d")
indexing_from = last_indexed_naive.strftime("%Y-%m-%d")
else:
indexing_from = (datetime.now() - timedelta(days=365)).strftime(
indexing_from = (datetime.now(UTC).replace(tzinfo=None) - timedelta(days=365)).strftime(
"%Y-%m-%d"
)
else:
@ -666,6 +676,7 @@ async def index_connector_content(
# For calendar connectors, default to today but allow future dates if explicitly provided
if connector.connector_type in [
SearchSourceConnectorType.GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
SearchSourceConnectorType.LUMA_CONNECTOR,
]:
# Default to today if no end_date provided (users can manually select future dates)
@ -977,6 +988,9 @@ async def index_connector_content(
index_composio_connector_task,
)
# For Composio Gmail and Calendar, use the same date calculation logic as normal connectors
# This ensures consistent behavior and uses last_indexed_at to reduce API calls
# (includes special case: if indexed today, go back 1 day to avoid missing data)
logger.info(
f"Triggering Composio connector indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)

View file

@ -112,6 +112,13 @@ def calculate_date_range(
Returns:
Tuple of (start_date_str, end_date_str)
"""
# Normalize "undefined" strings to None (from frontend)
# This prevents parsing errors and ensures consistent behavior across all indexers
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
if start_date is not None and end_date is not None:
return start_date, end_date

View file

@ -4,6 +4,8 @@ Google Calendar connector indexer.
from datetime import datetime, timedelta
import pytz
from dateutil.parser import isoparse
from google.oauth2.credentials import Credentials
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@ -205,6 +207,23 @@ async def index_google_calendar_events(
# Use provided dates (including future dates)
start_date_str = start_date
end_date_str = end_date
# If start_date and end_date are the same, adjust end_date to be one day later
# to ensure valid date range (start_date must be strictly before end_date)
if start_date_str == end_date_str:
# Parse the date and add one day to ensure valid range
dt = isoparse(end_date_str)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=pytz.UTC)
else:
dt = dt.astimezone(pytz.UTC)
# Add one day to end_date to make it strictly after start_date
dt_end = dt + timedelta(days=1)
end_date_str = dt_end.strftime("%Y-%m-%d")
logger.info(
f"Adjusted end_date from {end_date} to {end_date_str} "
f"to ensure valid date range (start_date must be strictly before end_date)"
)
await task_logger.log_task_progress(
log_entry,

View file

@ -116,6 +116,13 @@ async def index_luma_events(
luma_client = LumaConnector(api_key=api_key)
# Handle 'undefined' string from frontend (treat as None)
# This prevents "time data 'undefined' does not match format" errors
if start_date == "undefined" or start_date == "":
start_date = None
if end_date == "undefined" or end_date == "":
end_date = None
# Calculate date range
# For calendar connectors, allow future dates to index upcoming events
if start_date is None or end_date is None:

View file

@ -259,7 +259,13 @@ export const ConnectorIndicator: FC = () => {
editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR"
? () => {
startIndexing(editingConnector.id);
handleQuickIndexConnector(editingConnector.id, editingConnector.connector_type, stopIndexing);
handleQuickIndexConnector(
editingConnector.id,
editingConnector.connector_type,
stopIndexing,
startDate,
endDate
);
}
: undefined
}

View file

@ -272,8 +272,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
Re-indexing runs in the background
</p>
<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
You can continue using SurfSense while we sync your data. Check the Active tab
to see progress.
You can continue using SurfSense while we sync your data. Check inbox for updates.
</p>
</div>
</div>

View file

@ -189,8 +189,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
<div className="text-xs sm:text-sm">
<p className="font-medium text-xs sm:text-sm">Indexing runs in the background</p>
<p className="text-muted-foreground mt-1 text-[10px] sm:text-sm">
You can continue using SurfSense while we sync your data. Check the Active tab
to see progress.
You can continue using SurfSense while we sync your data. Check inbox for updates.
</p>
</div>
</div>

View file

@ -1400,9 +1400,15 @@ export const useConnectorDialog = () => {
[editingConnector, searchSpaceId, deleteConnector, router, cameFromMCPList]
);
// Handle quick index (index without date picker, uses backend defaults)
// Handle quick index (index with selected date range, or backend defaults if none selected)
const handleQuickIndexConnector = useCallback(
async (connectorId: number, connectorType?: string, stopIndexing?: (id: number) => void) => {
async (
connectorId: number,
connectorType?: string,
stopIndexing?: (id: number) => void,
startDate?: Date,
endDate?: Date
) => {
if (!searchSpaceId) return;
// Track quick index clicked event
@ -1411,10 +1417,16 @@ export const useConnectorDialog = () => {
}
try {
// Format dates if provided, otherwise pass undefined (backend will use defaults)
const startDateStr = startDate ? format(startDate, "yyyy-MM-dd") : undefined;
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
await indexConnector({
connector_id: connectorId,
queryParams: {
search_space_id: searchSpaceId,
start_date: startDateStr,
end_date: endDateStr,
},
});
toast.success("Indexing started", {