feat: enhance Google Drive indexing with new options

- Updated the Google Drive indexing functionality to include indexing options such as max files per folder, incremental sync, and inclusion of subfolders.
- Modified the API to accept a new 'indexing_options' parameter in the request body.
- Enhanced the UI to allow users to configure these options when selecting folders and files for indexing.
- Updated related components and tasks to support the new indexing options, ensuring a more flexible and efficient indexing process.
2026-04-30 03:16:25 +02:00 · 2026-01-17 12:33:57 +05:30 · 2026-01-17 12:33:57 +05:30 · a3112a24fe
commit a3112a24fe
parent cf53338119
9 changed files with 381 additions and 178 deletions
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@ -461,7 +461,7 @@ def index_google_drive_files_task(
    connector_id: int,
    search_space_id: int,
    user_id: str,
-    items_dict: dict,  # Dictionary with 'folders' and 'files' lists
+    items_dict: dict,  # Dictionary with 'folders', 'files', and 'indexing_options'
 ):
    """Celery task to index Google Drive folders and files."""
    import asyncio
@ -486,7 +486,7 @@ async def _index_google_drive_files(
    connector_id: int,
    search_space_id: int,
    user_id: str,
-    items_dict: dict,  # Dictionary with 'folders' and 'files' lists
+    items_dict: dict,  # Dictionary with 'folders', 'files', and 'indexing_options'
 ):
    """Index Google Drive folders and files with new session."""
    from app.routes.search_source_connectors_routes import (
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@ -72,6 +72,7 @@ async def _check_and_trigger_schedules():
                index_elasticsearch_documents_task,
                index_github_repos_task,
                index_google_calendar_events_task,
+                index_google_drive_files_task,
                index_google_gmail_messages_task,
                index_jira_issues_task,
                index_linear_issues_task,
@ -96,6 +97,7 @@ async def _check_and_trigger_schedules():
                SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
                SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
                SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
+                SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
            }

            # Trigger indexing for each due connector
@ -106,13 +108,42 @@ async def _check_and_trigger_schedules():
                        f"Triggering periodic indexing for connector {connector.id} "
                        f"({connector.connector_type.value})"
                    )
-                    task.delay(
-                        connector.id,
-                        connector.search_space_id,
-                        str(connector.user_id),
-                        None,  # start_date - uses last_indexed_at
-                        None,  # end_date - uses now
-                    )
+
+                    # Special handling for Google Drive - uses config for folder/file selection
+                    if connector.connector_type == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR:
+                        config = connector.config or {}
+                        selected_folders = config.get("selected_folders", [])
+                        selected_files = config.get("selected_files", [])
+                        indexing_options = config.get("indexing_options", {
+                            "max_files_per_folder": 100,
+                            "incremental_sync": True,
+                            "include_subfolders": True,
+                        })
+
+                        if selected_folders or selected_files:
+                            task.delay(
+                                connector.id,
+                                connector.search_space_id,
+                                str(connector.user_id),
+                                {
+                                    "folders": selected_folders,
+                                    "files": selected_files,
+                                    "indexing_options": indexing_options,
+                                },
+                            )
+                        else:
+                            logger.warning(
+                                f"Google Drive connector {connector.id} has no folders or files selected, skipping periodic indexing"
+                            )
+                            continue
+                    else:
+                        task.delay(
+                            connector.id,
+                            connector.search_space_id,
+                            str(connector.user_id),
+                            None,  # start_date - uses last_indexed_at
+                            None,  # end_date - uses now
+                        )

                    # Update next_scheduled_at for next run
                    from datetime import timedelta