mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-29 19:35:20 +02:00
feat: improve indexing logic and duplicate handling in connectors
- Enhanced Google Calendar and Composio connector indexing to track and log duplicate content, preventing re-indexing of already processed events. - Implemented robust error handling during final commits to manage integrity errors gracefully, ensuring successful indexing despite potential duplicates. - Updated notification service to differentiate between actual errors and warnings for duplicate content, improving user feedback. - Refactored date handling to ensure valid date ranges and adjusted end dates when necessary for better indexing accuracy.
This commit is contained in:
parent
d20bb385b5
commit
c48ba36fa4
6 changed files with 198 additions and 35 deletions
|
|
@ -18,7 +18,10 @@ from app.db import Document, DocumentType
|
||||||
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.connector_indexers.base import calculate_date_range
|
from app.tasks.connector_indexers.base import (
|
||||||
|
calculate_date_range,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
|
)
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
|
|
@ -256,6 +259,7 @@ async def index_composio_google_calendar(
|
||||||
|
|
||||||
documents_indexed = 0
|
documents_indexed = 0
|
||||||
documents_skipped = 0
|
documents_skipped = 0
|
||||||
|
duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
|
||||||
|
|
||||||
for event in events:
|
for event in events:
|
||||||
try:
|
try:
|
||||||
|
|
@ -349,7 +353,25 @@ async def index_composio_google_calendar(
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit( )
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist by unique_identifier_hash
|
||||||
|
# Check if a document with the same content_hash exists (from standard connector)
|
||||||
|
with session.no_autoflush:
|
||||||
|
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if duplicate_by_content:
|
||||||
|
# A document with the same content already exists (likely from standard connector)
|
||||||
|
logger.info(
|
||||||
|
f"Event {summary} already indexed by another connector "
|
||||||
|
f"(existing document ID: {duplicate_by_content.id}, "
|
||||||
|
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
||||||
|
)
|
||||||
|
duplicate_content_count += 1
|
||||||
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create new document
|
# Create new document
|
||||||
|
|
@ -429,10 +451,28 @@ async def index_composio_google_calendar(
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
||||||
)
|
)
|
||||||
await session.commit()
|
try:
|
||||||
logger.info(
|
await session.commit()
|
||||||
"Successfully committed all Composio Google Calendar document changes to database"
|
logger.info(
|
||||||
)
|
"Successfully committed all Composio Google Calendar document changes to database"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||||
|
if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
|
||||||
|
logger.warning(
|
||||||
|
f"Duplicate content_hash detected during final commit. "
|
||||||
|
f"This may occur if the same event was indexed by multiple connectors. "
|
||||||
|
f"Rolling back and continuing. Error: {e!s}"
|
||||||
|
)
|
||||||
|
await session.rollback()
|
||||||
|
# Don't fail the entire task - some documents may have been successfully indexed
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
# Build warning message if duplicates were found
|
||||||
|
warning_message = None
|
||||||
|
if duplicate_content_count > 0:
|
||||||
|
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||||
|
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
@ -440,10 +480,15 @@ async def index_composio_google_calendar(
|
||||||
{
|
{
|
||||||
"documents_indexed": documents_indexed,
|
"documents_indexed": documents_indexed,
|
||||||
"documents_skipped": documents_skipped,
|
"documents_skipped": documents_skipped,
|
||||||
|
"duplicate_content_count": duplicate_content_count,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
return documents_indexed, None
|
logger.info(
|
||||||
|
f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||||
|
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||||
|
)
|
||||||
|
return documents_indexed, warning_message
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(
|
logger.error(
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,8 @@ import logging
|
||||||
from datetime import UTC, datetime, timedelta
|
from datetime import UTC, datetime, timedelta
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import pytz
|
||||||
|
from dateutil.parser import isoparse
|
||||||
from fastapi import APIRouter, Body, Depends, HTTPException, Query
|
from fastapi import APIRouter, Body, Depends, HTTPException, Query
|
||||||
from pydantic import BaseModel, Field, ValidationError
|
from pydantic import BaseModel, Field, ValidationError
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
|
@ -681,6 +683,22 @@ async def index_connector_content(
|
||||||
]:
|
]:
|
||||||
# Default to today if no end_date provided (users can manually select future dates)
|
# Default to today if no end_date provided (users can manually select future dates)
|
||||||
indexing_to = today_str if end_date is None else end_date
|
indexing_to = today_str if end_date is None else end_date
|
||||||
|
|
||||||
|
# If start_date and end_date are the same, adjust end_date to be one day later
|
||||||
|
# to ensure valid date range (start_date must be strictly before end_date)
|
||||||
|
if indexing_from == indexing_to:
|
||||||
|
dt = isoparse(indexing_to)
|
||||||
|
if dt.tzinfo is None:
|
||||||
|
dt = dt.replace(tzinfo=pytz.UTC)
|
||||||
|
else:
|
||||||
|
dt = dt.astimezone(pytz.UTC)
|
||||||
|
# Add one day to end_date to make it strictly after start_date
|
||||||
|
dt_end = dt + timedelta(days=1)
|
||||||
|
indexing_to = dt_end.strftime("%Y-%m-%d")
|
||||||
|
logger.info(
|
||||||
|
f"Adjusted end_date from {end_date} to {indexing_to} "
|
||||||
|
f"to ensure valid date range (start_date must be strictly before end_date)"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# For non-calendar connectors, cap at today
|
# For non-calendar connectors, cap at today
|
||||||
indexing_to = end_date if end_date else today_str
|
indexing_to = end_date if end_date else today_str
|
||||||
|
|
@ -1231,20 +1249,48 @@ async def _run_indexing_with_notifications(
|
||||||
else:
|
else:
|
||||||
# No new documents processed - check if this is an error or just no changes
|
# No new documents processed - check if this is an error or just no changes
|
||||||
if error_or_warning:
|
if error_or_warning:
|
||||||
# Actual failure
|
# Check if this is a duplicate warning (success case) or an actual error
|
||||||
logger.error(f"Indexing failed: {error_or_warning}")
|
# Handle both normal and Composio calendar connectors
|
||||||
if notification:
|
error_or_warning_lower = str(error_or_warning).lower() if error_or_warning else ""
|
||||||
# Refresh notification to ensure it's not stale after indexing function commits
|
is_duplicate_warning = "skipped (duplicate)" in error_or_warning_lower
|
||||||
await session.refresh(notification)
|
|
||||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
if is_duplicate_warning:
|
||||||
session=session,
|
# Duplicate warnings are success cases - sync worked, just found duplicates
|
||||||
notification=notification,
|
logger.info(
|
||||||
indexed_count=0,
|
f"Indexing completed successfully: {error_or_warning}"
|
||||||
error_message=error_or_warning,
|
|
||||||
)
|
)
|
||||||
await (
|
# Still update timestamp so ElectricSQL syncs and clears "Syncing" UI
|
||||||
session.commit()
|
if update_timestamp_func:
|
||||||
) # Commit to ensure Electric SQL syncs the notification update
|
await update_timestamp_func(session, connector_id)
|
||||||
|
await session.commit() # Commit timestamp update
|
||||||
|
if notification:
|
||||||
|
# Refresh notification to ensure it's not stale after timestamp update commit
|
||||||
|
await session.refresh(notification)
|
||||||
|
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||||
|
session=session,
|
||||||
|
notification=notification,
|
||||||
|
indexed_count=0,
|
||||||
|
error_message=error_or_warning, # Pass as warning, not error
|
||||||
|
is_warning=True, # Flag to indicate this is a warning, not an error
|
||||||
|
)
|
||||||
|
await (
|
||||||
|
session.commit()
|
||||||
|
) # Commit to ensure Electric SQL syncs the notification update
|
||||||
|
else:
|
||||||
|
# Actual failure
|
||||||
|
logger.error(f"Indexing failed: {error_or_warning}")
|
||||||
|
if notification:
|
||||||
|
# Refresh notification to ensure it's not stale after indexing function commits
|
||||||
|
await session.refresh(notification)
|
||||||
|
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||||
|
session=session,
|
||||||
|
notification=notification,
|
||||||
|
indexed_count=0,
|
||||||
|
error_message=error_or_warning,
|
||||||
|
)
|
||||||
|
await (
|
||||||
|
session.commit()
|
||||||
|
) # Commit to ensure Electric SQL syncs the notification update
|
||||||
else:
|
else:
|
||||||
# Success - just no new documents to index (all skipped/unchanged)
|
# Success - just no new documents to index (all skipped/unchanged)
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
|
||||||
|
|
@ -335,6 +335,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
||||||
notification: Notification,
|
notification: Notification,
|
||||||
indexed_count: int,
|
indexed_count: int,
|
||||||
error_message: str | None = None,
|
error_message: str | None = None,
|
||||||
|
is_warning: bool = False,
|
||||||
) -> Notification:
|
) -> Notification:
|
||||||
"""
|
"""
|
||||||
Update notification when connector indexing completes.
|
Update notification when connector indexing completes.
|
||||||
|
|
@ -343,7 +344,8 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
||||||
session: Database session
|
session: Database session
|
||||||
notification: Notification to update
|
notification: Notification to update
|
||||||
indexed_count: Total number of items indexed
|
indexed_count: Total number of items indexed
|
||||||
error_message: Error message if indexing failed (optional)
|
error_message: Error message if indexing failed, or warning message (optional)
|
||||||
|
is_warning: If True, treat error_message as a warning (success case) rather than an error
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Updated notification
|
Updated notification
|
||||||
|
|
@ -352,10 +354,26 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
||||||
"connector_name", "Connector"
|
"connector_name", "Connector"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If there's an error message but items were indexed, treat it as a warning (partial success)
|
||||||
|
# If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
|
||||||
|
# Otherwise, treat it as a failure
|
||||||
if error_message:
|
if error_message:
|
||||||
title = f"Failed: {connector_name}"
|
if indexed_count > 0:
|
||||||
message = f"Sync failed: {error_message}"
|
# Partial success with warnings (e.g., duplicate content from other connectors)
|
||||||
status = "failed"
|
title = f"Ready: {connector_name}"
|
||||||
|
item_text = "item" if indexed_count == 1 else "items"
|
||||||
|
message = f"Now searchable! {indexed_count} {item_text} synced. Note: {error_message}"
|
||||||
|
status = "completed"
|
||||||
|
elif is_warning:
|
||||||
|
# Warning case (e.g., duplicates found) - treat as success
|
||||||
|
title = f"Ready: {connector_name}"
|
||||||
|
message = f"Sync completed. {error_message}"
|
||||||
|
status = "completed"
|
||||||
|
else:
|
||||||
|
# Complete failure
|
||||||
|
title = f"Failed: {connector_name}"
|
||||||
|
message = f"Sync failed: {error_message}"
|
||||||
|
status = "failed"
|
||||||
else:
|
else:
|
||||||
title = f"Ready: {connector_name}"
|
title = f"Ready: {connector_name}"
|
||||||
if indexed_count == 0:
|
if indexed_count == 0:
|
||||||
|
|
@ -367,7 +385,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
|
||||||
|
|
||||||
metadata_updates = {
|
metadata_updates = {
|
||||||
"indexed_count": indexed_count,
|
"indexed_count": indexed_count,
|
||||||
"sync_stage": "completed" if not error_message else "failed",
|
"sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed",
|
||||||
"error_message": error_message,
|
"error_message": error_message,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ from app.utils.document_converters import (
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_document_by_unique_identifier,
|
check_document_by_unique_identifier,
|
||||||
|
check_duplicate_document_by_hash,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
get_current_timestamp,
|
get_current_timestamp,
|
||||||
logger,
|
logger,
|
||||||
|
|
@ -289,6 +290,7 @@ async def index_google_calendar_events(
|
||||||
documents_indexed = 0
|
documents_indexed = 0
|
||||||
documents_skipped = 0
|
documents_skipped = 0
|
||||||
skipped_events = []
|
skipped_events = []
|
||||||
|
duplicate_content_count = 0 # Track events skipped due to duplicate content_hash
|
||||||
|
|
||||||
for event in events:
|
for event in events:
|
||||||
try:
|
try:
|
||||||
|
|
@ -409,6 +411,27 @@ async def index_google_calendar_events(
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Document doesn't exist by unique_identifier_hash
|
||||||
|
# Check if a document with the same content_hash exists (from another connector)
|
||||||
|
with session.no_autoflush:
|
||||||
|
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||||
|
session, content_hash
|
||||||
|
)
|
||||||
|
|
||||||
|
if duplicate_by_content:
|
||||||
|
# A document with the same content already exists (likely from Composio connector)
|
||||||
|
logger.info(
|
||||||
|
f"Event {event_summary} already indexed by another connector "
|
||||||
|
f"(existing document ID: {duplicate_by_content.id}, "
|
||||||
|
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
||||||
|
)
|
||||||
|
duplicate_content_count += 1
|
||||||
|
documents_skipped += 1
|
||||||
|
skipped_events.append(
|
||||||
|
f"{event_summary} (already indexed by another connector)"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
# Document doesn't exist - create new one
|
# Document doesn't exist - create new one
|
||||||
# Generate summary with metadata
|
# Generate summary with metadata
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
|
|
@ -501,7 +524,25 @@ async def index_google_calendar_events(
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
||||||
)
|
)
|
||||||
await session.commit()
|
try:
|
||||||
|
await session.commit()
|
||||||
|
except Exception as e:
|
||||||
|
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
||||||
|
if "duplicate key value violates unique constraint" in str(e).lower() or "uniqueviolationerror" in str(e).lower():
|
||||||
|
logger.warning(
|
||||||
|
f"Duplicate content_hash detected during final commit. "
|
||||||
|
f"This may occur if the same event was indexed by multiple connectors. "
|
||||||
|
f"Rolling back and continuing. Error: {e!s}"
|
||||||
|
)
|
||||||
|
await session.rollback()
|
||||||
|
# Don't fail the entire task - some documents may have been successfully indexed
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
# Build warning message if duplicates were found
|
||||||
|
warning_message = None
|
||||||
|
if duplicate_content_count > 0:
|
||||||
|
warning_message = f"{duplicate_content_count} skipped (duplicate)"
|
||||||
|
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
@ -510,14 +551,16 @@ async def index_google_calendar_events(
|
||||||
"events_processed": total_processed,
|
"events_processed": total_processed,
|
||||||
"documents_indexed": documents_indexed,
|
"documents_indexed": documents_indexed,
|
||||||
"documents_skipped": documents_skipped,
|
"documents_skipped": documents_skipped,
|
||||||
|
"duplicate_content_count": duplicate_content_count,
|
||||||
"skipped_events_count": len(skipped_events),
|
"skipped_events_count": len(skipped_events),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped"
|
f"Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
|
||||||
|
f"({duplicate_content_count} due to duplicate content from other connectors)"
|
||||||
)
|
)
|
||||||
return total_processed, None
|
return total_processed, warning_message
|
||||||
|
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
|
|
|
||||||
|
|
@ -100,10 +100,14 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
|
|
||||||
// Reset local quick indexing state when indexing completes or fails
|
// Reset local quick indexing state when indexing completes or fails
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!isIndexing) {
|
if (!isIndexing && isQuickIndexing) {
|
||||||
setIsQuickIndexing(false);
|
// Small delay to ensure smooth transition
|
||||||
|
const timer = setTimeout(() => {
|
||||||
|
setIsQuickIndexing(false);
|
||||||
|
}, 100);
|
||||||
|
return () => clearTimeout(timer);
|
||||||
}
|
}
|
||||||
}, [isIndexing]);
|
}, [isIndexing, isQuickIndexing]);
|
||||||
|
|
||||||
const handleDisconnectClick = () => {
|
const handleDisconnectClick = () => {
|
||||||
setShowDisconnectConfirm(true);
|
setShowDisconnectConfirm(true);
|
||||||
|
|
@ -119,11 +123,11 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
};
|
};
|
||||||
|
|
||||||
const handleQuickIndex = useCallback(() => {
|
const handleQuickIndex = useCallback(() => {
|
||||||
if (onQuickIndex) {
|
if (onQuickIndex && !isQuickIndexing && !isIndexing) {
|
||||||
setIsQuickIndexing(true);
|
setIsQuickIndexing(true);
|
||||||
onQuickIndex();
|
onQuickIndex();
|
||||||
}
|
}
|
||||||
}, [onQuickIndex]);
|
}, [onQuickIndex, isQuickIndexing, isIndexing]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="flex-1 flex flex-col min-h-0 overflow-hidden">
|
<div className="flex-1 flex flex-col min-h-0 overflow-hidden">
|
||||||
|
|
|
||||||
|
|
@ -1409,7 +1409,12 @@ export const useConnectorDialog = () => {
|
||||||
startDate?: Date,
|
startDate?: Date,
|
||||||
endDate?: Date
|
endDate?: Date
|
||||||
) => {
|
) => {
|
||||||
if (!searchSpaceId) return;
|
if (!searchSpaceId) {
|
||||||
|
if (stopIndexing) {
|
||||||
|
stopIndexing(connectorId);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Track quick index clicked event
|
// Track quick index clicked event
|
||||||
if (connectorType) {
|
if (connectorType) {
|
||||||
|
|
@ -1437,6 +1442,8 @@ export const useConnectorDialog = () => {
|
||||||
queryClient.invalidateQueries({
|
queryClient.invalidateQueries({
|
||||||
queryKey: cacheKeys.logs.summary(Number(searchSpaceId)),
|
queryKey: cacheKeys.logs.summary(Number(searchSpaceId)),
|
||||||
});
|
});
|
||||||
|
// Note: Don't call stopIndexing here - let useIndexingConnectors hook
|
||||||
|
// detect when last_indexed_at changes via Electric SQL
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error indexing connector content:", error);
|
console.error("Error indexing connector content:", error);
|
||||||
toast.error(error instanceof Error ? error.message : "Failed to start indexing");
|
toast.error(error instanceof Error ? error.message : "Failed to start indexing");
|
||||||
|
|
@ -1446,7 +1453,7 @@ export const useConnectorDialog = () => {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
[searchSpaceId, indexConnector]
|
[searchSpaceId, indexConnector, queryClient]
|
||||||
);
|
);
|
||||||
|
|
||||||
// Handle going back from edit view
|
// Handle going back from edit view
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue