From f54079643f61fbdbbfc52eb067eb1deed1bd7d76 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:53:35 +0200 Subject: [PATCH 01/39] feat(db): add GOOGLE_DRIVE_CONNECTOR to DocumentType and SearchSourceConnectorType enums --- surfsense_backend/app/db.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index a2a424c26..a6bc3b938 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -46,6 +46,7 @@ class DocumentType(str, Enum): CLICKUP_CONNECTOR = "CLICKUP_CONNECTOR" GOOGLE_CALENDAR_CONNECTOR = "GOOGLE_CALENDAR_CONNECTOR" GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR" + GOOGLE_DRIVE_CONNECTOR = "GOOGLE_DRIVE_CONNECTOR" AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR" LUMA_CONNECTOR = "LUMA_CONNECTOR" ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR" @@ -69,6 +70,7 @@ class SearchSourceConnectorType(str, Enum): CLICKUP_CONNECTOR = "CLICKUP_CONNECTOR" GOOGLE_CALENDAR_CONNECTOR = "GOOGLE_CALENDAR_CONNECTOR" GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR" + GOOGLE_DRIVE_CONNECTOR = "GOOGLE_DRIVE_CONNECTOR" AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR" LUMA_CONNECTOR = "LUMA_CONNECTOR" ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR" From 5dd88386383c7f53dc42b253e35c32cedefaed5e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:53:44 +0200 Subject: [PATCH 02/39] feat(db): add idempotent Alembic migration for GOOGLE_DRIVE_CONNECTOR enums --- .../54_add_google_drive_connector_enums.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 surfsense_backend/alembic/versions/54_add_google_drive_connector_enums.py diff --git a/surfsense_backend/alembic/versions/54_add_google_drive_connector_enums.py b/surfsense_backend/alembic/versions/54_add_google_drive_connector_enums.py new file mode 100644 index 000000000..8e7d69340 --- /dev/null +++ b/surfsense_backend/alembic/versions/54_add_google_drive_connector_enums.py @@ -0,0 +1,74 @@ +"""Add Google Drive connector enums + +Revision ID: 54 +Revises: 53 +Create Date: 2025-12-28 12:00:00.000000 + +""" + +from collections.abc import Sequence + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "54" +down_revision: str | None = "53" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Safely add 'GOOGLE_DRIVE_CONNECTOR' to enum types if missing.""" + + # Add to searchsourceconnectortype enum + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'GOOGLE_DRIVE_CONNECTOR' + ) THEN + ALTER TYPE searchsourceconnectortype ADD VALUE 'GOOGLE_DRIVE_CONNECTOR'; + END IF; + END + $$; + """ + ) + + # Add to documenttype enum + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'documenttype' AND e.enumlabel = 'GOOGLE_DRIVE_CONNECTOR' + ) THEN + ALTER TYPE documenttype ADD VALUE 'GOOGLE_DRIVE_CONNECTOR'; + END IF; + END + $$; + """ + ) + + +def downgrade() -> None: + """Remove 'GOOGLE_DRIVE_CONNECTOR' from enum types. + + Note: PostgreSQL doesn't support removing enum values directly. + This would require recreating the enum type, which is complex and risky. + For now, we'll leave the enum values in place. + + In a production environment with strict downgrade requirements, you would need to: + 1. 
Create new enum types without the value + 2. Convert all columns to use the new type + 3. Drop the old enum type + 4. Rename the new type to the old name + + This is left as pass to avoid accidental data loss. + """ + pass + From 28979851270674e2d855d46e94456c91d0d9d89b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:53:51 +0200 Subject: [PATCH 03/39] feat(config): add GOOGLE_DRIVE_REDIRECT_URI environment variable --- surfsense_backend/app/config/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 08be26de1..9c503fb18 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -82,6 +82,9 @@ class Config: # Google Gmail redirect URI GOOGLE_GMAIL_REDIRECT_URI = os.getenv("GOOGLE_GMAIL_REDIRECT_URI") + # Google Drive redirect URI + GOOGLE_DRIVE_REDIRECT_URI = os.getenv("GOOGLE_DRIVE_REDIRECT_URI") + # Airtable OAuth AIRTABLE_CLIENT_ID = os.getenv("AIRTABLE_CLIENT_ID") AIRTABLE_CLIENT_SECRET = os.getenv("AIRTABLE_CLIENT_SECRET") From 2c8717b14bf8455bfc113bbe13a0db793f9a8c99 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:54:26 +0200 Subject: [PATCH 04/39] feat(connectors): add Google Drive credentials module for OAuth management - Handle Google OAuth credential initialization and validation - Automatic token refresh with database persistence - Reuse existing tokens when valid --- .../app/connectors/google_drive/__init__.py | 24 ++++ .../connectors/google_drive/credentials.py | 109 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/__init__.py create mode 100644 surfsense_backend/app/connectors/google_drive/credentials.py diff --git a/surfsense_backend/app/connectors/google_drive/__init__.py b/surfsense_backend/app/connectors/google_drive/__init__.py new file mode 100644 index 000000000..c50135155 --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/__init__.py @@ -0,0 +1,24 @@ +""" +Google Drive Connector Module. + +Simple, modular approach to Google Drive indexing. +""" + +from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token +from .client import GoogleDriveClient +from .content_extractor import download_and_process_file +from .credentials import get_valid_credentials, validate_credentials +from .folder_manager import get_files_in_folder, list_folder_contents + +__all__ = [ + "GoogleDriveClient", + "get_valid_credentials", + "validate_credentials", + "download_and_process_file", + "get_files_in_folder", + "list_folder_contents", + "get_start_page_token", + "fetch_all_changes", + "categorize_change", +] + diff --git a/surfsense_backend/app/connectors/google_drive/credentials.py b/surfsense_backend/app/connectors/google_drive/credentials.py new file mode 100644 index 000000000..5d09df881 --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/credentials.py @@ -0,0 +1,109 @@ +""" +Google Drive OAuth Credentials Management. + +Handles credential validation, token refresh, and persistence to database. +Small, focused module for credential operations only. 
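+
+Illustrative usage (a minimal sketch; the wrapper name is hypothetical and the
+async session and connector id come from the caller):
+
+    from app.connectors.google_drive.credentials import get_valid_credentials
+
+    async def build_drive_credentials(session, connector_id: int):
+        # Loads the connector config, refreshes the access token if it has
+        # expired, and persists the refreshed token back to the connector row.
+        return await get_valid_credentials(session, connector_id)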
+""" + +import json +from datetime import datetime + +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select +from sqlalchemy.orm.attributes import flag_modified + +from app.db import SearchSourceConnector, SearchSourceConnectorType + + +async def get_valid_credentials( + session: AsyncSession, + connector_id: int, +) -> Credentials: + """ + Get valid Google OAuth credentials, refreshing if needed. + + Args: + session: Database session + connector_id: Connector ID + + Returns: + Valid Google OAuth credentials + + Raises: + ValueError: If credentials are missing or invalid + Exception: If token refresh fails + """ + # Fetch connector from database + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id + ) + ) + connector = result.scalars().first() + + if not connector: + raise ValueError(f"Connector {connector_id} not found") + + # Extract credentials from config + config_data = connector.config + exp = config_data.get("expiry", "").replace("Z", "") + + # Validate required fields + if not all( + [ + config_data.get("client_id"), + config_data.get("client_secret"), + config_data.get("refresh_token"), + ] + ): + raise ValueError( + "Google OAuth credentials (client_id, client_secret, refresh_token) must be set" + ) + + # Create credentials object + credentials = Credentials( + token=config_data.get("token"), + refresh_token=config_data.get("refresh_token"), + token_uri=config_data.get("token_uri"), + client_id=config_data.get("client_id"), + client_secret=config_data.get("client_secret"), + scopes=config_data.get("scopes", []), + expiry=datetime.fromisoformat(exp) if exp else None, + ) + + # Refresh token if expired + if credentials.expired or not credentials.valid: + try: + credentials.refresh(Request()) + + # Persist refreshed token to database + connector.config = json.loads(credentials.to_json()) + flag_modified(connector, "config") + await session.commit() + + except Exception as e: + raise Exception(f"Failed to refresh Google OAuth credentials: {e!s}") from e + + return credentials + + +def validate_credentials(credentials: Credentials) -> bool: + """ + Validate that credentials have required fields. + + Args: + credentials: Google OAuth credentials + + Returns: + True if valid, False otherwise + """ + return all( + [ + credentials.client_id, + credentials.client_secret, + credentials.refresh_token, + ] + ) + From 74386affdcebbdf422235b94a67729f7f73b4304 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:54:32 +0200 Subject: [PATCH 05/39] feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully --- .../app/connectors/google_drive/client.py | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/client.py diff --git a/surfsense_backend/app/connectors/google_drive/client.py b/surfsense_backend/app/connectors/google_drive/client.py new file mode 100644 index 000000000..6d2d0abfd --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/client.py @@ -0,0 +1,194 @@ +""" +Google Drive API Client. + +Core client for interacting with Google Drive API. +Handles service initialization and basic file operations. 
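+
+Illustrative usage (a sketch only; the connector id and the query string are
+example values, and the async session is supplied by the caller):
+
+    client = GoogleDriveClient(session, connector_id=42)
+    files, next_token, error = await client.list_files(
+        query="mimeType != 'application/vnd.google-apps.folder' and trashed = false",
+        page_size=100,
+    )
+    if error is None:
+        for f in files:
+            print(f["id"], f["name"], f["mimeType"])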
+""" + +from typing import Any + +from google.oauth2.credentials import Credentials +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError +from sqlalchemy.ext.asyncio import AsyncSession + +from .credentials import get_valid_credentials + + +class GoogleDriveClient: + """ + Main client for Google Drive API operations. + + Handles service initialization and provides methods for + listing files, getting metadata, and downloading content. + """ + + def __init__(self, session: AsyncSession, connector_id: int): + """ + Initialize Google Drive client. + + Args: + session: Database session + connector_id: ID of the Drive connector + """ + self.session = session + self.connector_id = connector_id + self.service = None + + async def get_service(self): + """ + Get or create the Drive service instance. + + Returns: + Google Drive service instance + + Raises: + Exception: If service creation fails + """ + if self.service: + return self.service + + try: + credentials = await get_valid_credentials(self.session, self.connector_id) + self.service = build("drive", "v3", credentials=credentials) + return self.service + except Exception as e: + raise Exception(f"Failed to create Google Drive service: {e!s}") from e + + async def list_files( + self, + query: str = "", + fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, size, webViewLink, parents, owners, createdTime, description)", + page_size: int = 100, + page_token: str | None = None, + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + List files from Google Drive with pagination. + + Args: + query: Search query (e.g., "mimeType != 'application/vnd.google-apps.folder'") + fields: Fields to retrieve + page_size: Number of files per page (max 1000) + page_token: Token for next page + + Returns: + Tuple of (files list, next_page_token, error message) + """ + try: + service = await self.get_service() + + params = { + "pageSize": min(page_size, 1000), + "fields": fields, + "supportsAllDrives": True, + "includeItemsFromAllDrives": True, + } + + if query: + params["q"] = query + if page_token: + params["pageToken"] = page_token + + result = service.files().list(**params).execute() + + files = result.get("files", []) + next_token = result.get("nextPageToken") + + return files, next_token, None + + except HttpError as e: + error_msg = f"HTTP error listing files: {e.resp.status} - {e.error_details}" + return [], None, error_msg + except Exception as e: + return [], None, f"Error listing files: {e!s}" + + async def get_file_metadata( + self, file_id: str, fields: str = "*" + ) -> tuple[dict[str, Any] | None, str | None]: + """ + Get metadata for a specific file. + + Args: + file_id: ID of the file + fields: Fields to retrieve + + Returns: + Tuple of (file metadata, error message) + """ + try: + service = await self.get_service() + file = service.files().get(fileId=file_id, fields=fields, supportsAllDrives=True).execute() + return file, None + except HttpError as e: + return None, f"HTTP error getting file metadata: {e.resp.status}" + except Exception as e: + return None, f"Error getting file metadata: {e!s}" + + async def download_file( + self, file_id: str + ) -> tuple[bytes | None, str | None]: + """ + Download binary file content. 
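+
+        Illustrative sketch (the file id and output path are example values):
+
+            content, error = await client.download_file("1AbCdEfGhIj")
+            if error is None and content is not None:
+                with open("download.bin", "wb") as fh:
+                    fh.write(content)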
+ + Args: + file_id: ID of the file to download + + Returns: + Tuple of (file content bytes, error message) + """ + try: + service = await self.get_service() + request = service.files().get_media(fileId=file_id) + + # Execute the download + import io + + fh = io.BytesIO() + from googleapiclient.http import MediaIoBaseDownload + + downloader = MediaIoBaseDownload(fh, request) + + done = False + while not done: + _, done = downloader.next_chunk() + + return fh.getvalue(), None + + except HttpError as e: + return None, f"HTTP error downloading file: {e.resp.status}" + except Exception as e: + return None, f"Error downloading file: {e!s}" + + async def export_google_file( + self, file_id: str, mime_type: str + ) -> tuple[bytes | None, str | None]: + """ + Export Google Workspace file to specified format. + + Args: + file_id: ID of the Google file + mime_type: Target MIME type (e.g., 'application/pdf', 'text/plain') + + Returns: + Tuple of (exported content as bytes, error message) + """ + try: + service = await self.get_service() + content = ( + service.files() + .export(fileId=file_id, mimeType=mime_type) + .execute() + ) + + # Content is already bytes from the API + # Keep as bytes to support both text and binary formats (like PDF) + if not isinstance(content, bytes): + content = content.encode("utf-8") + + return content, None + + except HttpError as e: + return None, f"HTTP error exporting file: {e.resp.status}" + except Exception as e: + return None, f"Error exporting file: {e!s}" + From 701c3409b386e8a85d725cef37664f95c39157b3 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:54:42 +0200 Subject: [PATCH 06/39] feat(connectors): add Google Drive file type detection and mapping - Detect Google Workspace files (Docs, Sheets, Slides) - Map to PDF export format to preserve rich content (images, formatting) - Identify files to skip (shortcuts, unsupported types) --- .../app/connectors/google_drive/file_types.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/file_types.py diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py new file mode 100644 index 000000000..f66680c6c --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -0,0 +1,37 @@ +""" +File Type Handlers for Google Drive. + +Simple module for basic file type detection. 
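+
+Illustrative usage (the MIME type is an example value):
+
+    mime = "application/vnd.google-apps.document"
+    if not should_skip_file(mime) and is_google_workspace_file(mime):
+        export_mime = get_export_mime_type(mime)  # -> "application/pdf"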
+""" + +# Google Workspace MIME types that need export +GOOGLE_DOC = "application/vnd.google-apps.document" +GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" +GOOGLE_SLIDE = "application/vnd.google-apps.presentation" +GOOGLE_FOLDER = "application/vnd.google-apps.folder" +GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut" + +# Export MIME types for Google Workspace files +# Export as PDF to preserve formatting, images, and structure +EXPORT_FORMATS = { + GOOGLE_DOC: "application/pdf", + GOOGLE_SHEET: "application/pdf", + GOOGLE_SLIDE: "application/pdf", +} + + +def is_google_workspace_file(mime_type: str) -> bool: + """Check if file is a Google Workspace file that needs export.""" + return mime_type.startswith("application/vnd.google-apps") + + +def should_skip_file(mime_type: str) -> bool: + """Check if file should be skipped (folders, shortcuts, etc).""" + return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT] + + +def get_export_mime_type(mime_type: str) -> str | None: + """Get export MIME type for Google Workspace files.""" + return EXPORT_FORMATS.get(mime_type) + + From 40304c6795b9ab669fb594ee140abf6d5ce2d41e Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:54:50 +0200 Subject: [PATCH 07/39] feat(connectors): add Google Drive content extraction using existing ETL - Download files from Google Drive to temporary location - Export Google Workspace files as PDF - Delegate content extraction to existing process_file_in_background - Reuse Surfsense's ETL services (Unstructured, LlamaCloud, Docling) --- .../google_drive/content_extractor.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/content_extractor.py diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py new file mode 100644 index 000000000..82b8d42b3 --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -0,0 +1,122 @@ +""" +Content Extraction for Google Drive Files. + +Downloads files and delegates to Surfsense's existing file processors. +""" + +import logging +import os +import tempfile +from pathlib import Path +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Log +from app.services.task_logging_service import TaskLoggingService + +from .client import GoogleDriveClient +from .file_types import get_export_mime_type, is_google_workspace_file, should_skip_file + +logger = logging.getLogger(__name__) + + +async def download_and_process_file( + client: GoogleDriveClient, + file: dict[str, Any], + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, +) -> tuple[Any, str | None]: + """ + Download Google Drive file and process using Surfsense's existing infrastructure. + + This is the ONLY function needed - it delegates everything to process_file_in_background. 
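+
+    Illustrative call (a sketch; the indexer supplies the real client, file
+    metadata dict, session, task logger, and log entry):
+
+        document, error = await download_and_process_file(
+            client=drive_client,
+            file=file_metadata,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            session=session,
+            task_logger=task_logger,
+            log_entry=log_entry,
+        )
+        if document is None and error:
+            logger.info("Skipped: %s", error)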
+ + Args: + client: GoogleDriveClient instance + file: File metadata from Drive API + search_space_id: ID of the search space + user_id: ID of the user + session: Database session + task_logger: Task logging service + log_entry: Log entry for tracking + + Returns: + Tuple of (Document object if successful, error message if failed) + """ + file_id = file.get("id") + file_name = file.get("name", "Unknown") + mime_type = file.get("mimeType", "") + + # Skip folders and shortcuts + if should_skip_file(mime_type): + return None, f"Skipping {mime_type}" + + logger.info(f"Downloading file: {file_name} ({mime_type})") + + temp_file_path = None + try: + # Step 1: Download or export the file + if is_google_workspace_file(mime_type): + # Google Workspace files need export (as PDF to preserve formatting & images) + export_mime = get_export_mime_type(mime_type) + if not export_mime: + return None, f"Cannot export Google Workspace type: {mime_type}" + + logger.info(f"Exporting Google Workspace file as {export_mime}") + content_bytes, error = await client.export_google_file(file_id, export_mime) + if error: + return None, error + + # Set extension based on export format + extension = ".pdf" if export_mime == "application/pdf" else ".txt" + else: + # Regular files - download directly + content_bytes, error = await client.download_file(file_id) + if error: + return None, error + + # Preserve original file extension + extension = Path(file_name).suffix or ".bin" + + # Save to temporary file + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: + tmp_file.write(content_bytes) + temp_file_path = tmp_file.name + + # Step 2: Delegate to Surfsense's existing file processor + # This handles ALL file types: markdown, audio, PDFs, Office docs, images, etc. 
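+        # The import is kept local to this function (mirroring the lazy-import
+        # pattern used elsewhere in this series); a plausible reason is avoiding
+        # a circular import with the document-processing tasks, but that is an
+        # assumption, not something stated in this patch.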
+ from app.tasks.document_processors.file_processors import ( + process_file_in_background, + ) + + logger.info(f"Processing {file_name} with Surfsense's file processor") + result = await process_file_in_background( + file_path=temp_file_path, + filename=file_name, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + ) + + # process_file_in_background returns None on duplicate/error, Document on success + return result, None + + except Exception as e: + logger.warning(f"Failed to process {file_name}: {e!s}") + return None, str(e) + + finally: + # Cleanup temp file (if process_file_in_background didn't already delete it) + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except Exception as e: + logger.debug(f"Could not delete temp file {temp_file_path}: {e}") + + From 84bde67979e82cd4010baa340506499a7d1830db Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:54:58 +0200 Subject: [PATCH 08/39] feat(connectors): add Google Drive folder browsing and file listing - List folder contents with full pagination support - Query root folder or specific parent folder - Return both folders and files with metadata (size, icons, links) - Filter out shortcuts and trashed items --- .../connectors/google_drive/folder_manager.py | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/folder_manager.py diff --git a/surfsense_backend/app/connectors/google_drive/folder_manager.py b/surfsense_backend/app/connectors/google_drive/folder_manager.py new file mode 100644 index 000000000..da9deb75d --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/folder_manager.py @@ -0,0 +1,243 @@ +""" +Folder Management for Google Drive. + +Handles folder listing, selection, and hierarchy operations. +Small, focused module for folder-related operations. +""" + +import logging +from typing import Any + +from .client import GoogleDriveClient + +logger = logging.getLogger(__name__) + + +async def list_folders( + client: GoogleDriveClient, + parent_id: str | None = None, +) -> tuple[list[dict[str, Any]], str | None]: + """ + List folders in Google Drive. + + Args: + client: GoogleDriveClient instance + parent_id: Parent folder ID (None for root) + + Returns: + Tuple of (folders list, error message) + """ + try: + # Build query to get only folders + query_parts = ["mimeType = 'application/vnd.google-apps.folder'", "trashed = false"] + + if parent_id: + query_parts.append(f"'{parent_id}' in parents") + + query = " and ".join(query_parts) + + folders, _, error = await client.list_files( + query=query, + fields="files(id, name, parents, createdTime, modifiedTime)", + page_size=100, + ) + + if error: + return [], error + + return folders, None + + except Exception as e: + logger.error(f"Error listing folders: {e!s}", exc_info=True) + return [], f"Error listing folders: {e!s}" + + +async def get_folder_hierarchy( + client: GoogleDriveClient, + folder_id: str, +) -> tuple[list[dict[str, str]], str | None]: + """ + Get the full path hierarchy for a folder. 
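+
+    Illustrative usage (ids and folder names are example values):
+
+        hierarchy, error = await get_folder_hierarchy(client, "folder-id-123")
+        # hierarchy -> [{"id": "...", "name": "My Drive"},
+        #               {"id": "...", "name": "Projects"},
+        #               {"id": "...", "name": "Documents"}]
+        print(format_folder_path(hierarchy))  # "My Drive / Projects / Documents"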
+ + Args: + client: GoogleDriveClient instance + folder_id: Folder ID to get hierarchy for + + Returns: + Tuple of (hierarchy list [{'id': ..., 'name': ...}], error message) + """ + try: + hierarchy = [] + current_id = folder_id + + # Traverse up to root + while current_id: + file, error = await client.get_file_metadata( + current_id, + fields="id, name, parents, mimeType" + ) + + if error: + return [], error + + if not file: + break + + hierarchy.insert(0, {"id": file["id"], "name": file["name"]}) + + # Get parent + parents = file.get("parents", []) + current_id = parents[0] if parents else None + + return hierarchy, None + + except Exception as e: + logger.error(f"Error getting folder hierarchy: {e!s}", exc_info=True) + return [], f"Error getting folder hierarchy: {e!s}" + + +async def get_files_in_folder( + client: GoogleDriveClient, + folder_id: str, + include_subfolders: bool = True, + page_token: str | None = None, +) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + Get all indexable files in a folder. + + Args: + client: GoogleDriveClient instance + folder_id: Folder ID to search in + include_subfolders: Whether to include subfolders + page_token: Pagination token + + Returns: + Tuple of (files list, next_page_token, error message) + """ + try: + # Build query + query_parts = [ + f"'{folder_id}' in parents", + "trashed = false", + "mimeType != 'application/vnd.google-apps.shortcut'", # Skip shortcuts + ] + + if not include_subfolders: + query_parts.append("mimeType != 'application/vnd.google-apps.folder'") + + query = " and ".join(query_parts) + + files, next_token, error = await client.list_files( + query=query, + page_size=100, + page_token=page_token, + ) + + if error: + return [], None, error + + return files, next_token, None + + except Exception as e: + logger.error(f"Error getting files in folder: {e!s}", exc_info=True) + return [], None, f"Error getting files in folder: {e!s}" + + +def format_folder_path(hierarchy: list[dict[str, str]]) -> str: + """ + Format folder hierarchy as a path string. + + Args: + hierarchy: List of folder dicts with 'id' and 'name' + + Returns: + Formatted path (e.g., "My Drive / Projects / Documents") + """ + if not hierarchy: + return "My Drive" + + folder_names = [folder["name"] for folder in hierarchy] + return " / ".join(folder_names) + + +async def list_folder_contents( + client: GoogleDriveClient, + parent_id: str | None = None, +) -> tuple[list[dict[str, Any]], str | None]: + """ + List both folders and files in a Google Drive folder. + + Fetches ALL items using pagination (handles folders with >100 items). + Returns items sorted with folders first, then files. + Each item includes 'isFolder' boolean for frontend rendering. 
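+
+    Illustrative usage (passing None lists the root of My Drive; a real folder
+    id would replace it):
+
+        items, error = await list_folder_contents(client, parent_id=None)
+        folders = [item for item in items if item["isFolder"]]
+        files = [item for item in items if not item["isFolder"]]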
+ + Args: + client: GoogleDriveClient instance + parent_id: Parent folder ID (None for root) + + Returns: + Tuple of (items list with folders and files, error message) + """ + try: + # Build query to get folders and files (exclude shortcuts) + query_parts = [ + "trashed = false", + "mimeType != 'application/vnd.google-apps.shortcut'", + ] + + # For root, we need to explicitly query for items in 'root' + # For subfolders, query for items with that parent + if parent_id: + query_parts.append(f"'{parent_id}' in parents") + else: + # Query for root-level items + query_parts.append("'root' in parents") + + query = " and ".join(query_parts) + + # Fetch all items with pagination (max 1000 per page) + all_items = [] + page_token = None + + while True: + items, next_token, error = await client.list_files( + query=query, + fields="files(id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink)", + page_size=1000, # Max allowed by Google Drive API + page_token=page_token, + ) + + if error: + return [], error + + all_items.extend(items) + + # If no more pages, break + if not next_token: + break + + page_token = next_token + + # Add 'isFolder' flag and sort (folders first, then files) + for item in all_items: + item["isFolder"] = item["mimeType"] == "application/vnd.google-apps.folder" + + # Sort: folders first (alphabetically), then files (alphabetically) + all_items.sort(key=lambda x: (not x["isFolder"], x["name"].lower())) + + # Count folders and files for logging + folder_count = sum(1 for item in all_items if item["isFolder"]) + file_count = len(all_items) - folder_count + + logger.info( + f"Listed {len(all_items)} items ({folder_count} folders, {file_count} files) " + + (f"in folder {parent_id}" if parent_id else "in root (My Drive)") + ) + + return all_items, None + + except Exception as e: + logger.error(f"Error listing folder contents: {e!s}", exc_info=True) + return [], f"Error listing folder contents: {e!s}" + + From 3e67d5f31ec9792c5a063f2ebcc7172b3c2fc57a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:06 +0200 Subject: [PATCH 09/39] feat(connectors): add Google Drive delta sync with change tracking - Get start page token for change tracking baseline - Fetch incremental changes using Google Drive Changes API - Categorize changes into added, modified, and removed files - Enable efficient re-indexing of only changed content --- .../connectors/google_drive/change_tracker.py | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 surfsense_backend/app/connectors/google_drive/change_tracker.py diff --git a/surfsense_backend/app/connectors/google_drive/change_tracker.py b/surfsense_backend/app/connectors/google_drive/change_tracker.py new file mode 100644 index 000000000..1c697af5f --- /dev/null +++ b/surfsense_backend/app/connectors/google_drive/change_tracker.py @@ -0,0 +1,213 @@ +""" +Change Tracking for Google Drive - Delta Sync Support. + +Handles change detection and incremental syncing using Drive API's changes endpoint. +Small, focused module for tracking file modifications. +""" + +import logging +from datetime import datetime +from typing import Any + +from .client import GoogleDriveClient + +logger = logging.getLogger(__name__) + + +async def get_start_page_token( + client: GoogleDriveClient, +) -> tuple[str | None, str | None]: + """ + Get the starting page token for change tracking. + + This token represents the current state and is used for future delta syncs. 
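+
+    Illustrative usage (a sketch of how this series stores the baseline token
+    on the connector's JSON config; connector and session come from the caller):
+
+        from sqlalchemy.orm.attributes import flag_modified
+
+        token, error = await get_start_page_token(client)
+        if token and not error:
+            connector.config["start_page_token"] = token
+            flag_modified(connector, "config")  # JSON mutation must be flagged
+            await session.commit()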
+ + Args: + client: GoogleDriveClient instance + + Returns: + Tuple of (start_page_token, error message) + """ + try: + service = await client.get_service() + response = service.changes().getStartPageToken(supportsAllDrives=True).execute() + token = response.get("startPageToken") + + logger.info(f"Got start page token: {token}") + return token, None + + except Exception as e: + logger.error(f"Error getting start page token: {e!s}", exc_info=True) + return None, f"Error getting start page token: {e!s}" + + +async def get_changes( + client: GoogleDriveClient, + page_token: str, + folder_id: str | None = None, +) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + Get list of changes since the given page token. + + Args: + client: GoogleDriveClient instance + page_token: Page token from previous sync + folder_id: Optional folder ID to filter changes + + Returns: + Tuple of (changes list, new_page_token, error message) + """ + try: + service = await client.get_service() + + params = { + "pageToken": page_token, + "pageSize": 100, + "fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, size, webViewLink, parents, trashed))", + "supportsAllDrives": True, + "includeItemsFromAllDrives": True, + } + + response = service.changes().list(**params).execute() + + changes = response.get("changes", []) + next_token = response.get("nextPageToken") + new_start_token = response.get("newStartPageToken") + + # Use new start token if this is the last page + token_to_return = new_start_token if new_start_token else next_token + + # Filter changes by folder if specified + if folder_id: + changes = await _filter_changes_by_folder(client, changes, folder_id) + + logger.info(f"Got {len(changes)} changes, next token: {token_to_return}") + return changes, token_to_return, None + + except Exception as e: + logger.error(f"Error getting changes: {e!s}", exc_info=True) + return [], None, f"Error getting changes: {e!s}" + + +async def _filter_changes_by_folder( + client: GoogleDriveClient, + changes: list[dict[str, Any]], + folder_id: str, +) -> list[dict[str, Any]]: + """ + Filter changes to only include files within the specified folder. + + Args: + client: GoogleDriveClient instance + changes: List of changes from API + folder_id: Folder ID to filter by + + Returns: + Filtered list of changes + """ + filtered = [] + + for change in changes: + file = change.get("file") + if not file: + # File was removed + filtered.append(change) + continue + + # Check if file is in the folder (or subfolder) + parents = file.get("parents", []) + if folder_id in parents: + filtered.append(change) + else: + # Check if any parent is a descendant of folder_id + # This is a simplified check - full implementation would traverse hierarchy + # For now, we'll include it and let indexer validate + filtered.append(change) + + return filtered + + +def categorize_change(change: dict[str, Any]) -> str: + """ + Categorize a change event. 
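+
+    Illustrative dispatch (a sketch of how an indexer reacts to each category;
+    the handler bodies are placeholders):
+
+        kind = categorize_change(change)
+        if kind in ("removed", "trashed"):
+            ...  # delete the previously indexed document
+        else:
+            ...  # "new" or "modified": download and (re)index the file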
+ + Args: + change: Change event from Drive API + + Returns: + Category: 'removed', 'trashed', 'modified', 'new' + """ + if change.get("removed"): + return "removed" + + file = change.get("file") + if not file: + return "removed" + + if file.get("trashed"): + return "trashed" + + # Check if file was recently created + created_time = file.get("createdTime") + modified_time = file.get("modifiedTime") + + if created_time and modified_time: + try: + created = datetime.fromisoformat(created_time.replace("Z", "+00:00")) + modified = datetime.fromisoformat(modified_time.replace("Z", "+00:00")) + + # If created and modified times are very close, it's likely a new file + time_diff = abs((modified - created).total_seconds()) + if time_diff < 60: # Within 1 minute + return "new" + except Exception: + pass + + return "modified" + + +async def fetch_all_changes( + client: GoogleDriveClient, + start_token: str, + folder_id: str | None = None, +) -> tuple[list[dict[str, Any]], str | None, str | None]: + """ + Fetch all changes from start token, handling pagination. + + Args: + client: GoogleDriveClient instance + start_token: Starting page token + folder_id: Optional folder ID to filter changes + + Returns: + Tuple of (all changes, final_page_token, error message) + """ + all_changes = [] + current_token = start_token + error = None + + try: + while current_token: + changes, next_token, err = await get_changes( + client, current_token, folder_id + ) + + if err: + error = err + break + + all_changes.extend(changes) + + # If next_token is None, we've reached the end + if not next_token or next_token == current_token: + break + + current_token = next_token + + logger.info(f"Fetched total of {len(all_changes)} changes") + return all_changes, current_token, error + + except Exception as e: + logger.error(f"Error fetching all changes: {e!s}", exc_info=True) + return all_changes, current_token, f"Error fetching all changes: {e!s}" + From bf02005d82ddb5c8329176b5469492535753c5f7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:13 +0200 Subject: [PATCH 10/39] feat(routes): add Google Drive OAuth and folder listing endpoints - OAuth initialization and callback handling - Folder and file browsing with parent_id support - Validate credentials and handle token refresh - Return folder contents with metadata for UI tree view --- .../google_drive_add_connector_route.py | 315 ++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 surfsense_backend/app/routes/google_drive_add_connector_route.py diff --git a/surfsense_backend/app/routes/google_drive_add_connector_route.py b/surfsense_backend/app/routes/google_drive_add_connector_route.py new file mode 100644 index 000000000..d11404781 --- /dev/null +++ b/surfsense_backend/app/routes/google_drive_add_connector_route.py @@ -0,0 +1,315 @@ +""" +Google Drive Connector OAuth Routes. + +Handles OAuth 2.0 authentication flow for Google Drive connector. +Folder selection happens at index time on the manage connector page. 
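+
+Illustrative client-side sketch (httpx, the backend base URL, the bearer-token
+auth header, and the space id are assumptions made only for this example):
+
+    import httpx
+
+    api_base = "http://localhost:8000"  # assumed backend URL, including any API prefix
+    headers = {"Authorization": "Bearer <session-token>"}  # assumed auth scheme
+
+    resp = httpx.get(
+        f"{api_base}/auth/google/drive/connector/add",
+        params={"space_id": 7},
+        headers=headers,
+    )
+    auth_url = resp.json()["auth_url"]  # send the browser here to grant access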
+ +Endpoints: +- GET /auth/google/drive/connector/add - Initiate OAuth +- GET /auth/google/drive/connector/callback - Handle OAuth callback +- GET /connectors/{connector_id}/google-drive/folders - List user's folders (for index-time selection) +""" + +import base64 +import json +import logging +import os +from uuid import UUID + +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import RedirectResponse +from google_auth_oauthlib.flow import Flow +from pydantic import ValidationError +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.future import select + +from app.config import config +from app.connectors.google_drive import ( + GoogleDriveClient, + get_start_page_token, + get_valid_credentials, + list_folder_contents, +) +from app.connectors.google_drive.folder_manager import list_folders +from app.db import ( + SearchSourceConnector, + SearchSourceConnectorType, + User, + get_async_session, +) +from app.users import current_active_user + +# Relax token scope validation for Google OAuth +os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1" + +logger = logging.getLogger(__name__) +router = APIRouter() + +# Google Drive OAuth scopes +SCOPES = [ + "https://www.googleapis.com/auth/drive.readonly", # Read-only access to Drive + "https://www.googleapis.com/auth/userinfo.email", # User email + "https://www.googleapis.com/auth/userinfo.profile", # User profile + "openid", +] + + +def get_google_flow(): + """Create and return a Google OAuth flow for Drive API.""" + try: + return Flow.from_client_config( + { + "web": { + "client_id": config.GOOGLE_OAUTH_CLIENT_ID, + "client_secret": config.GOOGLE_OAUTH_CLIENT_SECRET, + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "redirect_uris": [config.GOOGLE_DRIVE_REDIRECT_URI], + } + }, + scopes=SCOPES, + redirect_uri=config.GOOGLE_DRIVE_REDIRECT_URI, + ) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to create Google OAuth flow: {e!s}" + ) from e + + +@router.get("/auth/google/drive/connector/add") +async def connect_drive(space_id: int, user: User = Depends(current_active_user)): + """ + Initiate Google Drive OAuth flow. 
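+
+    Illustrative response (shape only; the URL is truncated):
+
+        {"auth_url": "https://accounts.google.com/o/oauth2/auth?..."}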
+ + Query params: + space_id: Search space ID to add connector to + + Returns: + JSON with auth_url to redirect user to Google authorization + """ + try: + if not space_id: + raise HTTPException(status_code=400, detail="space_id is required") + + flow = get_google_flow() + + # Encode space_id and user_id in state parameter + state_payload = json.dumps( + { + "space_id": space_id, + "user_id": str(user.id), + } + ) + state_encoded = base64.urlsafe_b64encode(state_payload.encode()).decode() + + # Generate authorization URL + auth_url, _ = flow.authorization_url( + access_type="offline", # Get refresh token + prompt="consent", # Force consent screen to get refresh token + include_granted_scopes="true", + state=state_encoded, + ) + + logger.info(f"Initiating Google Drive OAuth for user {user.id}, space {space_id}") + return {"auth_url": auth_url} + + except Exception as e: + logger.error(f"Failed to initiate Google Drive OAuth: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to initiate Google OAuth: {e!s}" + ) from e + + +@router.get("/auth/google/drive/connector/callback") +async def drive_callback( + request: Request, + code: str, + state: str, + session: AsyncSession = Depends(get_async_session), +): + """ + Handle Google Drive OAuth callback. + + Query params: + code: Authorization code from Google + state: Encoded state with space_id and user_id + + Returns: + Redirect to frontend success page + """ + try: + # Decode and parse state + decoded_state = base64.urlsafe_b64decode(state.encode()).decode() + data = json.loads(decoded_state) + + user_id = UUID(data["user_id"]) + space_id = data["space_id"] + + logger.info(f"Processing Google Drive callback for user {user_id}, space {space_id}") + + # Exchange authorization code for tokens + flow = get_google_flow() + flow.fetch_token(code=code) + + creds = flow.credentials + creds_dict = json.loads(creds.to_json()) + + # Check if connector already exists for this space/user + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.search_space_id == space_id, + SearchSourceConnector.user_id == user_id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR, + ) + ) + existing_connector = result.scalars().first() + + if existing_connector: + raise HTTPException( + status_code=409, + detail="A GOOGLE_DRIVE_CONNECTOR already exists in this search space. 
Each search space can have only one connector of each type per user.", + ) + + # Create new connector (NO folder selection here - happens at index time) + db_connector = SearchSourceConnector( + name="Google Drive Connector", + connector_type=SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR, + config={ + **creds_dict, + "start_page_token": None, # Will be set on first index + }, + search_space_id=space_id, + user_id=user_id, + is_indexable=True, + ) + + session.add(db_connector) + await session.commit() + await session.refresh(db_connector) + + # Get initial start page token for delta sync + try: + drive_client = GoogleDriveClient(session, db_connector.id) + start_token, token_error = await get_start_page_token(drive_client) + + if start_token and not token_error: + db_connector.config["start_page_token"] = start_token + from sqlalchemy.orm.attributes import flag_modified + + flag_modified(db_connector, "config") + await session.commit() + logger.info(f"Set initial start page token for connector {db_connector.id}") + except Exception as e: + logger.warning(f"Failed to get initial start page token: {e!s}") + + logger.info( + f"Successfully created Google Drive connector {db_connector.id} for user {user_id}" + ) + + # Redirect to connectors management page (not to folder selection) + return RedirectResponse( + url=f"{config.NEXT_FRONTEND_URL}/dashboard/{space_id}/connectors?success=google-drive-connected" + ) + + except HTTPException: + await session.rollback() + raise + except ValidationError as e: + await session.rollback() + logger.error(f"Validation error: {e!s}", exc_info=True) + raise HTTPException( + status_code=400, detail=f"Invalid connector configuration: {e!s}" + ) from e + except IntegrityError as e: + await session.rollback() + logger.error(f"Database integrity error: {e!s}", exc_info=True) + raise HTTPException( + status_code=409, + detail="A connector with this configuration already exists.", + ) from e + except Exception as e: + await session.rollback() + logger.error(f"Unexpected error in Drive callback: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to complete Google OAuth: {e!s}" + ) from e + + +@router.get("/connectors/{connector_id}/google-drive/folders") +async def list_google_drive_folders( + connector_id: int, + parent_id: str | None = None, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + List folders AND files in user's Google Drive with hierarchical support. + + This is called at index time from the manage connector page to display + the complete file system (folders and files). Only folders are selectable. + + Args: + connector_id: ID of the Google Drive connector + parent_id: Optional parent folder ID to list contents (None for root) + + Returns: + JSON with list of items: { + "items": [ + {"id": str, "name": str, "mimeType": str, "isFolder": bool, ...}, + ... 
+ ] + } + """ + try: + # Get connector and verify ownership + result = await session.execute( + select(SearchSourceConnector).filter( + SearchSourceConnector.id == connector_id, + SearchSourceConnector.user_id == user.id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR, + ) + ) + connector = result.scalars().first() + + if not connector: + raise HTTPException( + status_code=404, + detail="Google Drive connector not found or access denied", + ) + + # Initialize Drive client (credentials will be loaded on first API call) + drive_client = GoogleDriveClient(session, connector_id) + + # List both folders and files (sorted: folders first) + items, error = await list_folder_contents(drive_client, parent_id=parent_id) + + if error: + raise HTTPException( + status_code=500, detail=f"Failed to list folder contents: {error}" + ) + + # Count folders and files for better logging + folder_count = sum(1 for item in items if item.get("isFolder", False)) + file_count = len(items) - folder_count + + logger.info( + f"✅ Listed {len(items)} total items ({folder_count} folders, {file_count} files) for connector {connector_id}" + + (f" in folder {parent_id}" if parent_id else " in ROOT") + ) + + # Log first few items for debugging + if items: + logger.info(f"First 3 items: {[item.get('name') for item in items[:3]]}") + + return {"items": items} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing Drive contents: {e!s}", exc_info=True) + raise HTTPException( + status_code=500, detail=f"Failed to list Drive contents: {e!s}" + ) from e From 1696c7056a8e448ca7bec7c7f00bf046a3e54e26 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:25 +0200 Subject: [PATCH 11/39] feat(indexer): add Google Drive folder indexing with delta sync - Full folder scan on first index - Delta sync using change tracking for subsequent indexes - Process files in parallel batches - Handle file additions, modifications, and deletions - Store change tracking token for efficient re-indexing --- .../google_drive_indexer.py | 448 ++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py new file mode 100644 index 000000000..9c4d446de --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -0,0 +1,448 @@ +""" +Google Drive Indexer - Delegates all processing to Surfsense's file processors. 
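+
+Illustrative entry-point call (a sketch; the ids and folder are example values,
+and in this series the Celery connector task is the usual caller):
+
+    indexed_count, error = await index_google_drive_files(
+        session=session,
+        connector_id=42,
+        search_space_id=7,
+        user_id=str(user.id),
+        folder_id="1AbCdEfGhIj",
+        folder_name="Projects",
+    )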
+ +Handles: +- Folder-specific indexing (user selects folder) +- Delta sync (only index changed files) +- Delegates file processing to process_file_in_background +""" + +import logging +from datetime import datetime + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.connectors.google_drive import ( + GoogleDriveClient, + categorize_change, + download_and_process_file, + fetch_all_changes, + get_files_in_folder, + get_start_page_token, +) +from app.db import DocumentType, SearchSourceConnectorType +from app.services.task_logging_service import TaskLoggingService +from app.tasks.connector_indexers.base import ( + check_document_by_unique_identifier, + get_connector_by_id, + update_connector_last_indexed, +) +from app.utils.document_converters import generate_unique_identifier_hash + +logger = logging.getLogger(__name__) + + +async def index_google_drive_files( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str | None = None, + folder_name: str | None = None, + use_delta_sync: bool = True, + update_last_indexed: bool = True, + max_files: int = 500, +) -> tuple[int, str | None]: + """ + Index Google Drive files for a specific connector. + + Args: + session: Database session + connector_id: ID of the Drive connector + search_space_id: ID of the search space + user_id: ID of the user + folder_id: Specific folder to index (from UI/request, takes precedence) + folder_name: Folder name for display (from UI/request) + use_delta_sync: Whether to use change tracking for incremental sync + update_last_indexed: Whether to update last_indexed_at timestamp + max_files: Maximum number of files to index + + Returns: + Tuple of (number_of_indexed_files, error_message) + """ + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="google_drive_files_indexing", + source="connector_indexing_task", + message=f"Starting Google Drive indexing for connector {connector_id}", + metadata={ + "connector_id": connector_id, + "user_id": str(user_id), + "folder_id": folder_id, + "use_delta_sync": use_delta_sync, + "max_files": max_files, + }, + ) + + try: + # Get connector from database + connector = await get_connector_by_id( + session, connector_id, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR + ) + + if not connector: + error_msg = f"Google Drive connector with ID {connector_id} not found" + await task_logger.log_task_failure( + log_entry, error_msg, {"error_type": "ConnectorNotFound"} + ) + return 0, error_msg + + # Initialize Drive client + await task_logger.log_task_progress( + log_entry, + f"Initializing Google Drive client for connector {connector_id}", + {"stage": "client_initialization"}, + ) + + drive_client = GoogleDriveClient(session, connector_id) + + # Use folder from request params (required for Google Drive) + if not folder_id: + error_msg = "folder_id is required for Google Drive indexing" + await task_logger.log_task_failure( + log_entry, error_msg, {"error_type": "MissingParameter"} + ) + return 0, error_msg + + target_folder_id = folder_id + target_folder_name = folder_name or "Selected Folder" + + logger.info(f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})") + + # Decide sync strategy + start_page_token = connector.config.get("start_page_token") + can_use_delta_sync = use_delta_sync and start_page_token and connector.last_indexed_at + + if can_use_delta_sync: + logger.info(f"Using 
delta sync for connector {connector_id}") + result = await _index_with_delta_sync( + drive_client=drive_client, + session=session, + connector=connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + folder_id=target_folder_id, + start_page_token=start_page_token, + task_logger=task_logger, + log_entry=log_entry, + max_files=max_files, + ) + else: + logger.info(f"Using full scan for connector {connector_id}") + result = await _index_full_scan( + drive_client=drive_client, + session=session, + connector=connector, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + folder_id=target_folder_id, + folder_name=target_folder_name, + task_logger=task_logger, + log_entry=log_entry, + max_files=max_files, + ) + + documents_indexed, documents_skipped = result + + # Update last indexed timestamp and get new start page token + if documents_indexed > 0 or can_use_delta_sync: + # Get new start page token for next sync + new_token, token_error = await get_start_page_token(drive_client) + if new_token and not token_error: + from sqlalchemy.orm.attributes import flag_modified + + connector.config["start_page_token"] = new_token + flag_modified(connector, "config") + + await update_connector_last_indexed(session, connector, update_last_indexed) + + # Final commit + await session.commit() + logger.info( + f"Successfully committed Google Drive indexing changes to database" + ) + + # Log success + await task_logger.log_task_success( + log_entry, + f"Successfully completed Google Drive indexing for connector {connector_id}", + { + "files_processed": documents_indexed, + "files_skipped": documents_skipped, + "sync_type": "delta" if can_use_delta_sync else "full", + "folder": target_folder_name, + }, + ) + + logger.info( + f"Google Drive indexing completed: {documents_indexed} files indexed, {documents_skipped} skipped" + ) + return documents_indexed, None + + except SQLAlchemyError as db_error: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Database error during Google Drive indexing for connector {connector_id}", + str(db_error), + {"error_type": "SQLAlchemyError"}, + ) + logger.error(f"Database error: {db_error!s}", exc_info=True) + return 0, f"Database error: {db_error!s}" + except Exception as e: + await session.rollback() + await task_logger.log_task_failure( + log_entry, + f"Failed to index Google Drive files for connector {connector_id}", + str(e), + {"error_type": type(e).__name__}, + ) + logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True) + return 0, f"Failed to index Google Drive files: {e!s}" + + +async def _index_full_scan( + drive_client: GoogleDriveClient, + session: AsyncSession, + connector: any, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str | None, + folder_name: str, + task_logger: TaskLoggingService, + log_entry: any, + max_files: int, +) -> tuple[int, int]: + """Perform full scan indexing of a folder.""" + await task_logger.log_task_progress( + log_entry, + f"Starting full scan of folder: {folder_name}", + {"stage": "full_scan", "folder_id": folder_id}, + ) + + documents_indexed = 0 + documents_skipped = 0 + page_token = None + files_processed = 0 + + # Paginate through all files in folder + while files_processed < max_files: + files, next_token, error = await get_files_in_folder( + drive_client, folder_id, include_subfolders=False, page_token=page_token + ) + + if error: + logger.error(f"Error listing files: {error}") + break + + if 
not files: + break + + for file in files: + if files_processed >= max_files: + break + + files_processed += 1 + + # Process file + indexed, skipped = await _process_single_file( + drive_client=drive_client, + session=session, + file=file, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + + # Batch commit every 10 files + if documents_indexed % 10 == 0 and documents_indexed > 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} files indexed so far") + + page_token = next_token + if not page_token: + break + + logger.info( + f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped + + +async def _index_with_delta_sync( + drive_client: GoogleDriveClient, + session: AsyncSession, + connector: any, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str | None, + start_page_token: str, + task_logger: TaskLoggingService, + log_entry: any, + max_files: int, +) -> tuple[int, int]: + """Perform delta sync indexing using change tracking.""" + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync from token: {start_page_token[:20]}...", + {"stage": "delta_sync", "start_token": start_page_token}, + ) + + # Fetch all changes since last sync + changes, final_token, error = await fetch_all_changes( + drive_client, start_page_token, folder_id + ) + + if error: + logger.error(f"Error fetching changes: {error}") + return 0, 0 + + if not changes: + logger.info("No changes detected since last sync") + return 0, 0 + + logger.info(f"Processing {len(changes)} changes") + + documents_indexed = 0 + documents_skipped = 0 + files_processed = 0 + + for change in changes: + if files_processed >= max_files: + break + + files_processed += 1 + change_type = categorize_change(change) + + # Handle removed/trashed files + if change_type in ["removed", "trashed"]: + file_id = change.get("fileId") + if file_id: + await _remove_document(session, file_id, search_space_id) + continue + + # Handle modified/new files + file = change.get("file") + if not file: + continue + + indexed, skipped = await _process_single_file( + drive_client=drive_client, + session=session, + file=file, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + ) + + documents_indexed += indexed + documents_skipped += skipped + + # Batch commit every 10 files + if documents_indexed % 10 == 0 and documents_indexed > 0: + await session.commit() + logger.info(f"Committed batch: {documents_indexed} changes processed") + + logger.info( + f"Delta sync complete: {documents_indexed} indexed, {documents_skipped} skipped" + ) + return documents_indexed, documents_skipped + + +async def _process_single_file( + drive_client: GoogleDriveClient, + session: AsyncSession, + file: dict, + connector_id: int, + search_space_id: int, + user_id: str, + task_logger: TaskLoggingService, + log_entry: any, +) -> tuple[int, int]: + """ + Process a single file by downloading and using Surfsense's file processor. 
+ + Returns: + Tuple of (indexed_count, skipped_count) + """ + file_name = file.get("name", "Unknown") + mime_type = file.get("mimeType", "") + + try: + logger.info(f"Processing file: {file_name} ({mime_type})") + + # Download and process using Surfsense's existing infrastructure + # This handles: markdown, audio, PDFs, Office docs, images, etc. + # It also handles: deduplication, chunking, summarization, embedding + document, error = await download_and_process_file( + client=drive_client, + file=file, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + ) + + if error: + # Log and skip - not an error, just unsupported or empty + await task_logger.log_task_progress( + log_entry, + f"Skipped {file_name}: {error}", + {"status": "skipped", "reason": error}, + ) + return 0, 1 + + if document: + # Successfully indexed + await task_logger.log_task_progress( + log_entry, + f"Successfully indexed: {file_name}", + { + "status": "indexed", + "document_id": document.id, + "file_name": file_name, + }, + ) + return 1, 0 + else: + # Likely a duplicate or unsupported type + logger.info(f"No document created for {file_name} (duplicate or unsupported)") + return 0, 1 + + except Exception as e: + logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True) + return 0, 1 + + +async def _remove_document( + session: AsyncSession, file_id: str, search_space_id: int +): + """Remove a document that was deleted in Drive.""" + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_CONNECTOR, file_id, search_space_id + ) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + await session.delete(existing_document) + logger.info(f"Removed deleted file document: {file_id}") + + From 501d08f2f4b52d939a6adede37b7f6bb96ce1326 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:38 +0200 Subject: [PATCH 12/39] feat(routes): register Google Drive OAuth router --- surfsense_backend/app/routes/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index a055bf549..24751e596 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -11,6 +11,9 @@ from .google_calendar_add_connector_route import ( from .google_gmail_add_connector_route import ( router as google_gmail_add_connector_router, ) +from .google_drive_add_connector_route import ( + router as google_drive_add_connector_router, +) from .logs_routes import router as logs_router from .luma_add_connector_route import router as luma_add_connector_router from .new_chat_routes import router as new_chat_router @@ -33,6 +36,7 @@ router.include_router(podcasts_router) # Podcast task status and audio router.include_router(search_source_connectors_router) router.include_router(google_calendar_add_connector_router) router.include_router(google_gmail_add_connector_router) +router.include_router(google_drive_add_connector_router) router.include_router(airtable_add_connector_router) router.include_router(luma_add_connector_router) router.include_router(new_llm_config_router) # LLM configs with prompt configuration From 7b8900d51f119c9c0549eec37f6a8756aeda8221 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:46 +0200 Subject: [PATCH 13/39] feat(indexer): export Google Drive indexer function --- 
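For context, a minimal sketch of how the indexer exported here can be called directly. It mirrors the wrapper call added later in this series (run_google_drive_indexing); the session factory, connector/search-space IDs, and folder name below are placeholders, not real application objects.

# Hypothetical caller (illustrative only); assumes an async SQLAlchemy session factory.
from app.tasks.connector_indexers import index_google_drive_files


async def reindex_drive_folder(
    session_maker, connector_id: int, search_space_id: int, user_id: str
) -> None:
    async with session_maker() as session:
        indexed_count, error = await index_google_drive_files(
            session,
            connector_id,
            search_space_id,
            user_id,
            None,          # folder_id: per the route docs, None falls back to the connector's saved folder
            "My Drive",    # folder_name (display only)
            use_delta_sync=True,
            update_last_indexed=False,
        )
        if error:
            raise RuntimeError(f"Google Drive indexing failed: {error}")
        print(f"Indexed {indexed_count} Google Drive files")
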
surfsense_backend/app/tasks/connector_indexers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py index dcfca33c3..80a9eaf19 100644 --- a/surfsense_backend/app/tasks/connector_indexers/__init__.py +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -35,6 +35,7 @@ from .elasticsearch_indexer import index_elasticsearch_documents from .github_indexer import index_github_repos from .google_calendar_indexer import index_google_calendar_events from .google_gmail_indexer import index_google_gmail_messages +from .google_drive_indexer import index_google_drive_files from .jira_indexer import index_jira_issues # Issue tracking and project management @@ -57,6 +58,7 @@ __all__ = [ # noqa: RUF022 "index_github_repos", # Calendar and scheduling "index_google_calendar_events", + "index_google_drive_files", "index_luma_events", "index_jira_issues", # Issue tracking and project management From 358abdf02f4124d99c280e5ee019874f582bf62b Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:55:57 +0200 Subject: [PATCH 14/39] feat(routes): add Google Drive indexing support with folder selection - Accept folder_id and folder_name as indexing parameters - Hide date range for Google Drive connectors - Create wrapper function to avoid circular imports - Trigger Google Drive indexing Celery task --- .../routes/search_source_connectors_routes.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 5a7db7f37..d530163f4 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -45,6 +45,7 @@ from app.tasks.connector_indexers import ( index_github_repos, index_google_calendar_events, index_google_gmail_messages, + index_google_drive_files, index_jira_issues, index_linear_issues, index_luma_events, @@ -542,6 +543,14 @@ async def index_connector_content( None, description="End date for indexing (YYYY-MM-DD format). If not provided, uses today's date", ), + folder_id: str = Query( + None, + description="[Google Drive only] Folder ID to index. If not provided, uses the connector's saved selected_folder_id", + ), + folder_name: str = Query( + None, + description="[Google Drive only] Folder name for display purposes", + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -747,6 +756,25 @@ async def index_connector_content( ) response_message = "Google Gmail indexing started in the background." + elif ( + connector.connector_type == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR + ): + from app.tasks.celery_tasks.connector_tasks import ( + index_google_drive_files_task, + ) + + logger.info( + f"Triggering Google Drive indexing for connector {connector_id} into search space {search_space_id}, folder: {folder_name or 'default'}" + ) + index_google_drive_files_task.delay( + connector_id, + search_space_id, + str(user.id), + folder_id, + folder_name, + ) + response_message = "Google Drive indexing started in the background." 
+ elif connector.connector_type == SearchSourceConnectorType.DISCORD_CONNECTOR: from app.tasks.celery_tasks.connector_tasks import ( index_discord_messages_task, @@ -1515,6 +1543,50 @@ async def run_google_gmail_indexing( # Optionally update status in DB to indicate failure +async def run_google_drive_indexing( + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str, + folder_name: str, +): + """Runs the Google Drive indexing task and updates the timestamp.""" + try: + from app.tasks.connector_indexers.google_drive_indexer import ( + index_google_drive_files, + ) + + indexed_count, error_message = await index_google_drive_files( + session, + connector_id, + search_space_id, + user_id, + folder_id, + folder_name, + use_delta_sync=True, + update_last_indexed=False, + ) + if error_message: + logger.error( + f"Google Drive indexing failed for connector {connector_id}: {error_message}" + ) + # Optionally update status in DB to indicate failure + else: + logger.info( + f"Google Drive indexing successful for connector {connector_id}. Indexed {indexed_count} documents." + ) + # Update the last indexed timestamp only on success + await update_connector_last_indexed(session, connector_id) + await session.commit() # Commit timestamp update + except Exception as e: + logger.error( + f"Critical error in run_google_drive_indexing for connector {connector_id}: {e}", + exc_info=True, + ) + # Optionally update status in DB to indicate failure + + # Add new helper functions for luma indexing async def run_luma_indexing_with_new_session( connector_id: int, From 1c83327fc7dc6c3272c27503e61269cbf543d463 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:56:11 +0200 Subject: [PATCH 15/39] feat(celery): add Google Drive indexing Celery task - Create async task for Google Drive folder indexing - Accept folder_id and folder_name parameters - Call indexing wrapper to avoid circular imports --- .../app/tasks/celery_tasks/connector_tasks.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 6cd557dc4..8e507915f 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -473,6 +473,58 @@ async def _index_google_gmail_messages( ) +@celery_app.task(name="index_google_drive_files", bind=True) +def index_google_drive_files_task( + self, + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str, + folder_name: str, +): + """Celery task to index Google Drive files.""" + import asyncio + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete( + _index_google_drive_files( + connector_id, + search_space_id, + user_id, + folder_id, + folder_name, + ) + ) + finally: + loop.close() + + +async def _index_google_drive_files( + connector_id: int, + search_space_id: int, + user_id: str, + folder_id: str, + folder_name: str, +): + """Index Google Drive files with new session.""" + from app.routes.search_source_connectors_routes import ( + run_google_drive_indexing, + ) + + async with get_celery_session_maker()() as session: + await run_google_drive_indexing( + session, + connector_id, + search_space_id, + user_id, + folder_id, + folder_name, + ) + + @celery_app.task(name="index_discord_messages", bind=True) def index_discord_messages_task( self, From 
2d24f9ac7921d4c8cc1f3296e43c27b303ca1e3d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:56:30 +0200 Subject: [PATCH 16/39] feat(types): add GOOGLE_DRIVE_CONNECTOR to frontend enum --- surfsense_web/contracts/enums/connector.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/surfsense_web/contracts/enums/connector.ts b/surfsense_web/contracts/enums/connector.ts index 6cdbc5656..eb2cf7ad8 100644 --- a/surfsense_web/contracts/enums/connector.ts +++ b/surfsense_web/contracts/enums/connector.ts @@ -14,6 +14,7 @@ export enum EnumConnectorName { CLICKUP_CONNECTOR = "CLICKUP_CONNECTOR", GOOGLE_CALENDAR_CONNECTOR = "GOOGLE_CALENDAR_CONNECTOR", GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR", + GOOGLE_DRIVE_CONNECTOR = "GOOGLE_DRIVE_CONNECTOR", AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR", LUMA_CONNECTOR = "LUMA_CONNECTOR", ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR", From 11d94e0ea6ed8a5146001c2c228674aa2071b30d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:56:36 +0200 Subject: [PATCH 17/39] feat(ui): add Google Drive icon to connector icons mapping --- surfsense_web/contracts/enums/connectorIcons.tsx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index 87840d7e4..661be5253 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -26,6 +26,7 @@ import { Sparkles, Telescope, Webhook, + HardDrive, } from "lucide-react"; import { EnumConnectorName } from "./connector"; @@ -57,6 +58,8 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return ; case EnumConnectorName.GOOGLE_GMAIL_CONNECTOR: return ; + case EnumConnectorName.GOOGLE_DRIVE_CONNECTOR: + return ; case EnumConnectorName.AIRTABLE_CONNECTOR: return ; case EnumConnectorName.CONFLUENCE_CONNECTOR: From bfbd813f4297605522b665cb532731739447dee0 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:56:42 +0200 Subject: [PATCH 18/39] feat(i18n): add Google Drive connector translation keys --- surfsense_web/messages/en.json | 1 + surfsense_web/messages/zh.json | 1 + 2 files changed, 2 insertions(+) diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index eac362b9c..f70c854e0 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -303,6 +303,7 @@ "luma_desc": "Connect to Luma to search events, meetups and gatherings.", "calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.", "gmail_desc": "Connect to your Gmail account to search through your emails.", + "google_drive_desc": "Connect to Google Drive to search and index your files and documents.", "zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.", "webcrawler_desc": "Crawl and index content from any public web pages." 
}, diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index b943a3c2c..483a10a10 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -303,6 +303,7 @@ "luma_desc": "连接到 Luma 以搜索活动、聚会和集会。", "calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。", "gmail_desc": "连接到您的 Gmail 账户以搜索您的电子邮件。", + "google_drive_desc": "连接到 Google 云端硬盘以搜索和索引您的文件和文档。", "zoom_desc": "连接到 Zoom 以访问会议录制和转录。", "webcrawler_desc": "爬取和索引任何公开网页的内容。" }, From 48112f66df4096b6b44f898e11ec01d18f175e7c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:56:52 +0200 Subject: [PATCH 19/39] feat(ui): add Google Drive connector card to Productivity category --- surfsense_web/components/sources/connector-data.tsx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/surfsense_web/components/sources/connector-data.tsx b/surfsense_web/components/sources/connector-data.tsx index 338c3ae20..7fca3e6b9 100644 --- a/surfsense_web/components/sources/connector-data.tsx +++ b/surfsense_web/components/sources/connector-data.tsx @@ -183,6 +183,13 @@ export const connectorCategories: ConnectorCategory[] = [ icon: getConnectorIcon(EnumConnectorName.GOOGLE_GMAIL_CONNECTOR, "h-6 w-6"), status: "available", }, + { + id: "google-drive-connector", + title: "Google Drive", + description: "google_drive_desc", + icon: getConnectorIcon(EnumConnectorName.GOOGLE_DRIVE_CONNECTOR, "h-6 w-6"), + status: "available", + }, { id: "luma-connector", title: "Luma", From 90b3474b47d9d34e8182b0adda2251faee8feaed Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:57:02 +0200 Subject: [PATCH 20/39] feat(hooks): add folder parameters to indexConnector function - Accept folderId and folderName for Google Drive indexing - Pass folder parameters to backend API --- surfsense_web/hooks/use-search-source-connectors.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/surfsense_web/hooks/use-search-source-connectors.ts b/surfsense_web/hooks/use-search-source-connectors.ts index 2f77d7d82..ee8ce5518 100644 --- a/surfsense_web/hooks/use-search-source-connectors.ts +++ b/surfsense_web/hooks/use-search-source-connectors.ts @@ -267,7 +267,9 @@ export const useSearchSourceConnectors = (lazy: boolean = false, searchSpaceId?: connectorId: number, searchSpaceId: string | number, startDate?: string, - endDate?: string + endDate?: string, + folderId?: string, + folderName?: string ) => { try { // Build query parameters @@ -280,6 +282,12 @@ export const useSearchSourceConnectors = (lazy: boolean = false, searchSpaceId?: if (endDate) { params.append("end_date", endDate); } + if (folderId) { + params.append("folder_id", folderId); + } + if (folderName) { + params.append("folder_name", folderName); + } const response = await authenticatedFetch( `${ From ad4d424d3815b35335c703975dedd561ceb7aadb Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:57:10 +0200 Subject: [PATCH 21/39] feat(ui): add Google Drive OAuth connection page - Handle OAuth flow similar to Gmail/Calendar - Show connection status and redirect to manage page - Display connector features and file type support - No folder selection at connection time (done at index time) --- .../add/google-drive-connector/page.tsx | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/google-drive-connector/page.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/google-drive-connector/page.tsx 
b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/google-drive-connector/page.tsx new file mode 100644 index 000000000..b9fb8d953 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/google-drive-connector/page.tsx @@ -0,0 +1,218 @@ +"use client"; + +import { ArrowLeft, Check, ExternalLink, Loader2 } from "lucide-react"; +import { motion } from "motion/react"; +import Link from "next/link"; +import { useParams, useRouter, useSearchParams } from "next/navigation"; +import { useEffect, useState } from "react"; +import { toast } from "sonner"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { EnumConnectorName } from "@/contracts/enums/connector"; +import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; +import { + type SearchSourceConnector, + useSearchSourceConnectors, +} from "@/hooks/use-search-source-connectors"; +import { authenticatedFetch } from "@/lib/auth-utils"; + +export default function GoogleDriveConnectorPage() { + const router = useRouter(); + const params = useParams(); + const searchParams = useSearchParams(); + const searchSpaceId = params.search_space_id as string; + + const [isConnecting, setIsConnecting] = useState(false); + const [doesConnectorExist, setDoesConnectorExist] = useState(false); + + const { fetchConnectors } = useSearchSourceConnectors(true, Number.parseInt(searchSpaceId)); + + // Check if connector exists and handle OAuth success + useEffect(() => { + const success = searchParams.get("success"); + + fetchConnectors(Number.parseInt(searchSpaceId)).then((data) => { + const driveConnector = data.find( + (c: SearchSourceConnector) => c.connector_type === EnumConnectorName.GOOGLE_DRIVE_CONNECTOR + ); + + if (driveConnector) { + setDoesConnectorExist(true); + + // If just connected, show success and redirect + if (success === "true") { + toast.success("Google Drive connected successfully!"); + setTimeout(() => { + router.push(`/dashboard/${searchSpaceId}/connectors`); + }, 1500); + } + } + }); + }, [searchParams, fetchConnectors, searchSpaceId, router]); + + const handleConnectGoogle = async () => { + try { + setIsConnecting(true); + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/auth/google/drive/connector/add/?space_id=${searchSpaceId}`, + { method: "GET" } + ); + + if (!response.ok) { + throw new Error("Failed to initiate Google OAuth"); + } + + const data = await response.json(); + window.location.href = data.auth_url; + } catch (error) { + console.error("Error connecting to Google:", error); + toast.error("Failed to connect to Google Drive"); + } finally { + setIsConnecting(false); + } + }; + + return ( +
+ + {/* Header */} +
+ + + Back to connectors + +
+
+ {getConnectorIcon(EnumConnectorName.GOOGLE_DRIVE_CONNECTOR, "h-6 w-6")} +
+
+

Connect Google Drive

+

+ Securely connect your Google Drive account +

+
+
+
+ + {/* Connection Card */} + {!doesConnectorExist ? ( + + + Connect Your Google Account + + Authorize read-only access to your Google Drive. You'll select which folder to + index when you start indexing. + + + +
+ + Read-only access to your Drive files +
+
+ + Index documents, spreadsheets, presentations, PDFs & more +
+
+ + Automatic updates with change tracking +
+
+ + Secure OAuth 2.0 authentication +
+
+ + + + +
+ ) : ( + + + ✅ Already Connected + + Your Google Drive connector is already set up. Go to the connectors page to + start indexing. + + + + + + + )} + + {/* Information Card */} + + + How Google Drive Integration Works + + +
+

1️⃣ Connect Your Account

+

+ First, securely connect your Google Drive account using OAuth 2.0. We only + request read-only access. +

+
+
+

2️⃣ Select Folder to Index

+

+ When you're ready to index, go to the connectors page and click "Index". You'll + choose which folder to process. +

+
+
+

3️⃣ Automatic Change Detection

+

+ We use Google Drive's change tracking API to detect when files are modified, + added, or deleted. Only changed files are re-indexed. +

+
+
+

📄 Comprehensive File Support

+

+ Supports Google Workspace files (Docs, Sheets, Slides), Microsoft Office + documents, PDFs, text files, images (with OCR), and more. +

+
+
+
+
+
+ ); +} From 5df04c3caa54573723c0a0158cebf6e6a4d2647c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:57:18 +0200 Subject: [PATCH 22/39] feat(ui): add hierarchical Google Drive folder tree browser - Display folders and files with lazy loading - Show different icons for file types (docs, sheets, slides, etc) - Expandable folder tree with proper indentation - Selectable folders for indexing - Handle overflow with proper truncation - Full pagination support for large folder structures --- .../connectors/google-drive-folder-tree.tsx | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 surfsense_web/components/connectors/google-drive-folder-tree.tsx diff --git a/surfsense_web/components/connectors/google-drive-folder-tree.tsx b/surfsense_web/components/connectors/google-drive-folder-tree.tsx new file mode 100644 index 000000000..22ef97556 --- /dev/null +++ b/surfsense_web/components/connectors/google-drive-folder-tree.tsx @@ -0,0 +1,340 @@ +"use client"; + +import { + ChevronDown, + ChevronRight, + File, + FileText, + Folder, + FolderOpen, + HardDrive, + Image, + Loader2, + Sheet, + Presentation, +} from "lucide-react"; +import { useState } from "react"; +import { Button } from "@/components/ui/button"; +import { ScrollArea } from "@/components/ui/scroll-area"; +import { cn } from "@/lib/utils"; +import { authenticatedFetch } from "@/lib/auth-utils"; + +interface DriveItem { + id: string; + name: string; + mimeType: string; + isFolder: boolean; + parents?: string[]; + size?: number; + iconLink?: string; +} + +interface ItemTreeNode { + item: DriveItem; + children: DriveItem[] | null; // null = not loaded, [] = loaded but empty + isExpanded: boolean; + isLoading: boolean; +} + +interface GoogleDriveFolderTreeProps { + connectorId: number; + selectedFolderId: string | null; + onSelectFolder: (folderId: string, folderName: string) => void; +} + +// Helper to get appropriate icon for file type +function getFileIcon(mimeType: string, className: string = "h-4 w-4") { + if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) { + return ; + } + if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) { + return ; + } + if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) { + return ; + } + if (mimeType.includes("image")) { + return ; + } + return ; +} + +// Helper to format file size +function formatFileSize(bytes: number | undefined): string { + if (!bytes) return ""; + if (bytes < 1024) return `${bytes} B`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`; + if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; + return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`; +} + +export function GoogleDriveFolderTree({ + connectorId, + selectedFolderId, + onSelectFolder, +}: GoogleDriveFolderTreeProps) { + const [rootItems, setRootItems] = useState([]); + const [itemStates, setItemStates] = useState>(new Map()); + const [isLoadingRoot, setIsLoadingRoot] = useState(false); + const [isInitialized, setIsInitialized] = useState(false); + + // Load root items (folders and files) on mount + const loadRootItems = async () => { + if (isInitialized) return; // Already loaded + + setIsLoadingRoot(true); + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/connectors/${connectorId}/google-drive/folders` + ); + if (!response.ok) throw new Error("Failed to load items"); + + const data = await 
response.json(); + setRootItems(data.items || []); + setIsInitialized(true); + } catch (error) { + console.error("Error loading root items:", error); + } finally { + setIsLoadingRoot(false); + } + }; + + // Helper function to find an item recursively through all loaded items + const findItem = (itemId: string): DriveItem | undefined => { + // First check if we have it in itemStates + const state = itemStates.get(itemId); + if (state?.item) return state.item; + + // Check root items + const rootItem = rootItems.find((item) => item.id === itemId); + if (rootItem) return rootItem; + + // Recursively search through all loaded children + for (const [, nodeState] of itemStates) { + if (nodeState.children) { + const found = nodeState.children.find((child) => child.id === itemId); + if (found) return found; + } + } + + return undefined; + }; + + // Load children (folders and files) for a specific folder + const loadFolderContents = async (folderId: string) => { + try { + // Set loading state + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: true }); + } else { + // First time loading this folder - create initial state + const item = findItem(folderId); + if (item) { + newMap.set(folderId, { + item, + children: null, + isExpanded: false, + isLoading: true, + }); + } + } + return newMap; + }); + + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/connectors/${connectorId}/google-drive/folders?parent_id=${folderId}` + ); + if (!response.ok) throw new Error("Failed to load folder contents"); + + const data = await response.json(); + const items = data.items || []; + + // Check if folder only contains files (no subfolders) + const hasSubfolders = items.some((item: DriveItem) => item.isFolder); + + // Update item state with loaded children + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + const item = existing?.item || findItem(folderId); + + if (item) { + newMap.set(folderId, { + item, + children: items, + isExpanded: true, // Always expand after loading + isLoading: false, + }); + } else { + console.error(`Could not find item for folderId: ${folderId}`); + } + return newMap; + }); + } catch (error) { + console.error("Error loading folder contents:", error); + // Clear loading state on error + setItemStates((prev) => { + const newMap = new Map(prev); + const existing = newMap.get(folderId); + if (existing) { + newMap.set(folderId, { ...existing, isLoading: false }); + } + return newMap; + }); + } + }; + + // Toggle folder expansion + const toggleFolder = async (item: DriveItem) => { + if (!item.isFolder) return; // Only folders can be expanded + + const state = itemStates.get(item.id); + + if (!state || state.children === null) { + // First time expanding - load children + await loadFolderContents(item.id); + } else { + // Toggle expansion state + setItemStates((prev) => { + const newMap = new Map(prev); + newMap.set(item.id, { + ...state, + isExpanded: !state.isExpanded, + }); + return newMap; + }); + } + }; + + // Recursive render function for item tree + const renderItem = (item: DriveItem, level: number = 0) => { + const state = itemStates.get(item.id); + const isExpanded = state?.isExpanded || false; + const isLoading = state?.isLoading || false; + const children = state?.children; + const isSelected = selectedFolderId === item.id; + const isFolder = item.isFolder; + + // Separate 
folders and files for children + const childFolders = children?.filter((c) => c.isFolder) || []; + const childFiles = children?.filter((c) => !c.isFolder) || []; + + return ( +
+ + + {/* Render children if expanded (folders first, then files) */} + {isExpanded && isFolder && children && ( +
+ {/* Render folders first */} + {childFolders.map((child) => renderItem(child, level + 1))} + + {/* Render files */} + {childFiles.map((child) => renderItem(child, level + 1))} + + {/* Empty state */} + {children.length === 0 && ( +
+ Empty folder +
+ )} +
+ )} +
+ ); + }; + + // Initialize on first render + if (!isInitialized && !isLoadingRoot) { + loadRootItems(); + } + + return ( +
+ +
+ {/* My Drive Header (always visible, selectable) */} +
+ +
+ + {/* Loading indicator */} + {isLoadingRoot && ( +
+ +
+ )} + + {/* Root items (folders and files) - same level as Google Drive shows */} +
+ {!isLoadingRoot && rootItems.map((item) => renderItem(item, 0))} +
+ + {/* Empty state */} + {!isLoadingRoot && rootItems.length === 0 && ( +
+ No files or folders found in your Google Drive +
+ )} +
+
+
+ ); +} From c4a95ecc024ca9ef8b0f0705bb4200a7279d9aa4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 15:57:26 +0200 Subject: [PATCH 23/39] feat(ui): integrate Google Drive folder selection into manage connectors page - Add folder selection dialog for Google Drive indexing - Hide date picker and quick index for Google Drive - Show folder tree browser in modal - Pass selected folder to indexing API - Adjust modal size to prevent overflow --- .../connectors/(manage)/page.tsx | 215 ++++++++++++++++-- 1 file changed, 190 insertions(+), 25 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index e2f219448..fd1f7da1d 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -5,6 +5,8 @@ import { Calendar as CalendarIcon, Clock, Edit, + Folder, + HardDrive, Loader2, Plus, RefreshCw, @@ -61,6 +63,13 @@ import { EnumConnectorName } from "@/contracts/enums/connector"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import { useSearchSourceConnectors } from "@/hooks/use-search-source-connectors"; import { cn } from "@/lib/utils"; +import { authenticatedFetch } from "@/lib/auth-utils"; +import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; + +interface DriveFolder { + id: string; + name: string; +} export default function ConnectorsPage() { const t = useTranslations("connectors"); @@ -105,6 +114,13 @@ export default function ConnectorsPage() { const [customFrequency, setCustomFrequency] = useState(""); const [isSavingPeriodic, setIsSavingPeriodic] = useState(false); + // Google Drive folder selection state + const [driveFolderDialogOpen, setDriveFolderDialogOpen] = useState(false); + const [driveFolders, setDriveFolders] = useState([]); + const [selectedFolderId, setSelectedFolderId] = useState(""); + const [selectedFolderName, setSelectedFolderName] = useState(""); + const [isLoadingFolders, setIsLoadingFolders] = useState(false); + useEffect(() => { if (error) { toast.error(t("failed_load")); @@ -129,8 +145,78 @@ export default function ConnectorsPage() { // Handle opening date picker for indexing const handleOpenDatePicker = (connectorId: number) => { + // Check if this is a Google Drive connector + const connector = connectors.find((c) => c.id === connectorId); + if (connector?.connector_type === EnumConnectorName.GOOGLE_DRIVE_CONNECTOR) { + // Open folder selection dialog for Google Drive + handleOpenDriveFolderDialog(connectorId); + } else { + // Open date picker for other connectors + setSelectedConnectorForIndexing(connectorId); + setDatePickerOpen(true); + } + }; + + // Handle opening Google Drive folder selection dialog + const handleOpenDriveFolderDialog = async (connectorId: number) => { setSelectedConnectorForIndexing(connectorId); - setDatePickerOpen(true); + setDriveFolderDialogOpen(true); + setIsLoadingFolders(true); + + try { + const response = await authenticatedFetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/connectors/${connectorId}/google-drive/folders`, + { method: "GET" } + ); + + if (!response.ok) { + throw new Error("Failed to load folders"); + } + + const data = await response.json(); + setDriveFolders(data.folders || []); + } catch (error) { + console.error("Error loading folders:", error); + toast.error("Failed to load Google Drive folders"); + 
setDriveFolderDialogOpen(false); + } finally { + setIsLoadingFolders(false); + } + }; + + // Handle Google Drive folder indexing + const handleIndexDriveFolder = async () => { + if (selectedConnectorForIndexing === null || !selectedFolderId) { + toast.error("Please select a folder"); + return; + } + + setDriveFolderDialogOpen(false); + + try { + setIndexingConnectorId(selectedConnectorForIndexing); + const selectedFolder = driveFolders.find((f) => f.id === selectedFolderId); + const folderName = selectedFolder?.name || "Selected Folder"; + + // Call indexConnector with folder_id and folder_name as query params + await indexConnector( + selectedConnectorForIndexing, + searchSpaceId, + undefined, + undefined, + selectedFolderId, + folderName + ); + toast.success(t("indexing_started")); + } catch (error) { + console.error("Error indexing connector content:", error); + toast.error(error instanceof Error ? error.message : t("indexing_failed")); + } finally { + setIndexingConnectorId(null); + setSelectedConnectorForIndexing(null); + setSelectedFolderId(""); + setDriveFolders([]); + } }; // Handle connector indexing with dates @@ -361,39 +447,52 @@ export default function ConnectorsPage() { > {indexingConnectorId === connector.id ? ( + ) : connector.connector_type === EnumConnectorName.GOOGLE_DRIVE_CONNECTOR ? ( + ) : ( )} - {t("index_date_range")} + + {connector.connector_type === EnumConnectorName.GOOGLE_DRIVE_CONNECTOR + ? "Select folder to index" + : t("index_date_range")} + -

{t("index_date_range")}

-
- - - - - - - - -

{t("quick_index_auto")}

+

+ {connector.connector_type === EnumConnectorName.GOOGLE_DRIVE_CONNECTOR + ? "Select folder to index" + : t("index_date_range")} +

+ {/* Hide quick index button for Google Drive (requires folder selection) */} + {connector.connector_type !== EnumConnectorName.GOOGLE_DRIVE_CONNECTOR && ( + + + + + + +

{t("quick_index_auto")}

+
+
+
+ )} )} {connector.is_indexable && ( @@ -581,6 +680,72 @@ export default function ConnectorsPage() { + {/* Google Drive Folder Selection Dialog */} + + + + Select Google Drive Folder + + Browse and select a folder to index. Click folders to expand and see subfolders. + + +
+
+ + {selectedConnectorForIndexing && ( + { + setSelectedFolderId(folderId); + setSelectedFolderName(folderName); + }} + /> + )} +

+ Changes to files in this folder will be automatically detected and re-indexed. +

+
+ {selectedFolderId && selectedFolderName && ( +
+
+

Selected folder:

+

+ {selectedFolderName} +

+
+
+

What will be indexed:

+
    +
  • Google Docs, Sheets, Slides (as PDFs)
  • +
  • PDFs, Word, Excel, PowerPoint files
  • +
  • Text files, markdown, code files
  • +
  • Images (with OCR if enabled)
  • +
+
+
+ )} +
+ + + + +
+
+ {/* Periodic Indexing Configuration Dialog */} From e0edfef5fcce0d40e09505dd871a4f44bf7dad4a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:48:34 +0200 Subject: [PATCH 24/39] feat(ui): add multiple folder selection with checkboxes to Google Drive tree - Replace single folder selection with multi-select checkboxes - Remove cascading auto-select for clearer UX - Each folder must be selected individually - Visual indicators for selected folders --- .../connectors/google-drive-folder-tree.tsx | 118 +++++++++++------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/surfsense_web/components/connectors/google-drive-folder-tree.tsx b/surfsense_web/components/connectors/google-drive-folder-tree.tsx index 22ef97556..793fdc750 100644 --- a/surfsense_web/components/connectors/google-drive-folder-tree.tsx +++ b/surfsense_web/components/connectors/google-drive-folder-tree.tsx @@ -15,6 +15,7 @@ import { } from "lucide-react"; import { useState } from "react"; import { Button } from "@/components/ui/button"; +import { Checkbox } from "@/components/ui/checkbox"; import { ScrollArea } from "@/components/ui/scroll-area"; import { cn } from "@/lib/utils"; import { authenticatedFetch } from "@/lib/auth-utils"; @@ -36,10 +37,15 @@ interface ItemTreeNode { isLoading: boolean; } +interface SelectedFolder { + id: string; + name: string; +} + interface GoogleDriveFolderTreeProps { connectorId: number; - selectedFolderId: string | null; - onSelectFolder: (folderId: string, folderName: string) => void; + selectedFolders: SelectedFolder[]; + onSelectFolders: (folders: SelectedFolder[]) => void; } // Helper to get appropriate icon for file type @@ -59,25 +65,32 @@ function getFileIcon(mimeType: string, className: string = "h-4 w-4") { return ; } -// Helper to format file size -function formatFileSize(bytes: number | undefined): string { - if (!bytes) return ""; - if (bytes < 1024) return `${bytes} B`; - if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`; - if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; - return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`; -} - export function GoogleDriveFolderTree({ connectorId, - selectedFolderId, - onSelectFolder, + selectedFolders, + onSelectFolders, }: GoogleDriveFolderTreeProps) { const [rootItems, setRootItems] = useState([]); const [itemStates, setItemStates] = useState>(new Map()); const [isLoadingRoot, setIsLoadingRoot] = useState(false); const [isInitialized, setIsInitialized] = useState(false); + // Helper to check if a folder is selected + const isFolderSelected = (folderId: string): boolean => { + return selectedFolders.some((f) => f.id === folderId); + }; + + // Handle folder checkbox toggle + const toggleFolderSelection = (folderId: string, folderName: string) => { + if (isFolderSelected(folderId)) { + // Remove from selection + onSelectFolders(selectedFolders.filter((f) => f.id !== folderId)); + } else { + // Add to selection + onSelectFolders([...selectedFolders, { id: folderId, name: folderName }]); + } + }; + // Load root items (folders and files) on mount const loadRootItems = async () => { if (isInitialized) return; // Already loaded @@ -215,7 +228,7 @@ export function GoogleDriveFolderTree({ const isExpanded = state?.isExpanded || false; const isLoading = state?.isLoading || false; const children = state?.children; - const isSelected = selectedFolderId === item.id; + const isSelected = isFolderSelected(item.id); const isFolder = item.isFolder; // Separate folders and files 
for children @@ -224,15 +237,13 @@ export function GoogleDriveFolderTree({ return (
- + isFolder && toggleFolder(item)} + > + {item.name} + +
{/* Render children if expanded (folders first, then files) */} {isExpanded && isFolder && children && (
{/* Render folders first */} {childFolders.map((child) => renderItem(child, level + 1))} - + {/* Render files */} {childFiles.map((child) => renderItem(child, level + 1))} - + {/* Empty state */} {children.length === 0 && ( -
- Empty folder -
+
Empty folder
)}
)} @@ -302,17 +328,17 @@ export function GoogleDriveFolderTree({
{/* My Drive Header (always visible, selectable) */}
- + toggleFolderSelection("root", "My Drive")}> + My Drive + +
{/* Loading indicator */} From 27a4bcdfc20466f936c0e4a3cf608264aa89b0f4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:48:56 +0200 Subject: [PATCH 25/39] feat(ui): support multiple folder selection in Google Drive indexing - Update manage page to handle array of selected folders - Add info icon with clear description about folder-level indexing - Display list of all selected folders before indexing - Remove unnecessary file type details section - Pass comma-separated folder IDs and names to backend --- .../connectors/(manage)/page.tsx | 128 +++++++++--------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index fd1f7da1d..bbbfd61e0 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -7,6 +7,7 @@ import { Edit, Folder, HardDrive, + Info, Loader2, Plus, RefreshCw, @@ -117,8 +118,7 @@ export default function ConnectorsPage() { // Google Drive folder selection state const [driveFolderDialogOpen, setDriveFolderDialogOpen] = useState(false); const [driveFolders, setDriveFolders] = useState([]); - const [selectedFolderId, setSelectedFolderId] = useState(""); - const [selectedFolderName, setSelectedFolderName] = useState(""); + const [selectedFolders, setSelectedFolders] = useState>([]); const [isLoadingFolders, setIsLoadingFolders] = useState(false); useEffect(() => { @@ -186,8 +186,8 @@ export default function ConnectorsPage() { // Handle Google Drive folder indexing const handleIndexDriveFolder = async () => { - if (selectedConnectorForIndexing === null || !selectedFolderId) { - toast.error("Please select a folder"); + if (selectedConnectorForIndexing === null || selectedFolders.length === 0) { + toast.error("Please select at least one folder"); return; } @@ -195,28 +195,26 @@ export default function ConnectorsPage() { try { setIndexingConnectorId(selectedConnectorForIndexing); - const selectedFolder = driveFolders.find((f) => f.id === selectedFolderId); - const folderName = selectedFolder?.name || "Selected Folder"; - // Call indexConnector with folder_id and folder_name as query params + // Call indexConnector with folder_ids and folder_names as query params await indexConnector( selectedConnectorForIndexing, searchSpaceId, undefined, undefined, - selectedFolderId, - folderName + selectedFolders.map((f) => f.id).join(","), + selectedFolders.map((f) => f.name).join(", ") ); toast.success(t("indexing_started")); } catch (error) { console.error("Error indexing connector content:", error); toast.error(error instanceof Error ? error.message : t("indexing_failed")); } finally { - setIndexingConnectorId(null); - setSelectedConnectorForIndexing(null); - setSelectedFolderId(""); - setDriveFolders([]); - } + setIndexingConnectorId(null); + setSelectedConnectorForIndexing(null); + setSelectedFolders([]); + setDriveFolders([]); + } }; // Handle connector indexing with dates @@ -683,66 +681,62 @@ export default function ConnectorsPage() { {/* Google Drive Folder Selection Dialog */} - - Select Google Drive Folder - - Browse and select a folder to index. Click folders to expand and see subfolders. - - -
+ + Select Google Drive Folders + + + + Select folders to index. Only files directly in each folder will be + processed—subfolders must be selected separately. + + + +
- {selectedConnectorForIndexing && ( - { - setSelectedFolderId(folderId); - setSelectedFolderName(folderName); - }} - /> - )} -

- Changes to files in this folder will be automatically detected and re-indexed. -

-
- {selectedFolderId && selectedFolderName && ( -
-
-

Selected folder:

-

- {selectedFolderName} -

-
-
-

What will be indexed:

-
    -
  • Google Docs, Sheets, Slides (as PDFs)
  • -
  • PDFs, Word, Excel, PowerPoint files
  • -
  • Text files, markdown, code files
  • -
  • Images (with OCR if enabled)
  • -
-
-
+ {selectedConnectorForIndexing && ( + { + setSelectedFolders(folders); + }} + /> )}
+ {selectedFolders.length > 0 && ( +
+
+

+ Selected {selectedFolders.length} folder{selectedFolders.length > 1 ? "s" : ""}: +

+
+ {selectedFolders.map((folder) => ( +

+ • {folder.name} +

+ ))} +
+
+
+ )} +
- - + onClick={() => { + setDriveFolderDialogOpen(false); + setSelectedConnectorForIndexing(null); + setSelectedFolders([]); + setDriveFolders([]); + }} + > + {tCommon("cancel")} + + +
From 634eeb887e35ebc173c2de43e255d0d3739021e1 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:49:20 +0200 Subject: [PATCH 26/39] feat(routes): support multiple Google Drive folder indexing - Accept comma-separated folder_ids and folder_names - Loop through each folder and index sequentially - Collect total indexed count and errors - Update timestamp only on full success --- .../routes/search_source_connectors_routes.py | 56 +++++++++++++------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index d530163f4..af1f18513 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1548,35 +1548,55 @@ async def run_google_drive_indexing( connector_id: int, search_space_id: int, user_id: str, - folder_id: str, - folder_name: str, + folder_ids: str, # Comma-separated folder IDs + folder_names: str, # Comma-separated folder names ): - """Runs the Google Drive indexing task and updates the timestamp.""" + """Runs the Google Drive indexing task for multiple folders and updates the timestamp.""" try: from app.tasks.connector_indexers.google_drive_indexer import ( index_google_drive_files, ) - indexed_count, error_message = await index_google_drive_files( - session, - connector_id, - search_space_id, - user_id, - folder_id, - folder_name, - use_delta_sync=True, - update_last_indexed=False, - ) - if error_message: + # Split comma-separated IDs and names into lists + folder_id_list = [fid.strip() for fid in folder_ids.split(",")] + folder_name_list = [fname.strip() for fname in folder_names.split(",")] + + total_indexed = 0 + errors = [] + + # Index each folder + for folder_id, folder_name in zip(folder_id_list, folder_name_list): + try: + indexed_count, error_message = await index_google_drive_files( + session, + connector_id, + search_space_id, + user_id, + folder_id, + folder_name, + use_delta_sync=True, + update_last_indexed=False, + ) + if error_message: + errors.append(f"{folder_name}: {error_message}") + else: + total_indexed += indexed_count + except Exception as e: + errors.append(f"{folder_name}: {str(e)}") + logger.error( + f"Error indexing folder {folder_name} ({folder_id}): {e}", + exc_info=True, + ) + + if errors: logger.error( - f"Google Drive indexing failed for connector {connector_id}: {error_message}" + f"Google Drive indexing completed with errors for connector {connector_id}: {'; '.join(errors)}" ) - # Optionally update status in DB to indicate failure else: logger.info( - f"Google Drive indexing successful for connector {connector_id}. Indexed {indexed_count} documents." + f"Google Drive indexing successful for connector {connector_id}. Indexed {total_indexed} documents from {len(folder_id_list)} folder(s)." 
) - # Update the last indexed timestamp only on success + # Update the last indexed timestamp only on full success await update_connector_last_indexed(session, connector_id) await session.commit() # Commit timestamp update except Exception as e: From c9815fd6fb78037629409dd25673807122514dc4 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:49:47 +0200 Subject: [PATCH 27/39] feat(celery): update Google Drive task for multiple folders - Accept comma-separated folder_ids and folder_names parameters - Pass through to indexing function for batch processing --- .../app/tasks/celery_tasks/connector_tasks.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 8e507915f..44f57d464 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -479,10 +479,10 @@ def index_google_drive_files_task( connector_id: int, search_space_id: int, user_id: str, - folder_id: str, - folder_name: str, + folder_ids: str, # Comma-separated folder IDs + folder_names: str, # Comma-separated folder names ): - """Celery task to index Google Drive files.""" + """Celery task to index Google Drive files from multiple folders.""" import asyncio loop = asyncio.new_event_loop() @@ -494,8 +494,8 @@ def index_google_drive_files_task( connector_id, search_space_id, user_id, - folder_id, - folder_name, + folder_ids, + folder_names, ) ) finally: @@ -506,10 +506,10 @@ async def _index_google_drive_files( connector_id: int, search_space_id: int, user_id: str, - folder_id: str, - folder_name: str, + folder_ids: str, # Comma-separated folder IDs + folder_names: str, # Comma-separated folder names ): - """Index Google Drive files with new session.""" + """Index Google Drive files from multiple folders with new session.""" from app.routes.search_source_connectors_routes import ( run_google_drive_indexing, ) @@ -520,8 +520,8 @@ async def _index_google_drive_files( connector_id, search_space_id, user_id, - folder_id, - folder_name, + folder_ids, + folder_names, ) From 9f1fd20944d46a9475ec68b826addcfb3ce61f6c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:55:14 +0200 Subject: [PATCH 28/39] feat(connectors): mark Google Drive documents with GOOGLE_DRIVE_CONNECTOR type - Change document_type from file type (PDF, DOCX) to GOOGLE_DRIVE_CONNECTOR - Store original file type in metadata for reference - Add Google Drive specific metadata (file_id, mime_type, source) - Include export format info for Google Workspace files - Enables proper source tracking and bulk management --- .../google_drive/content_extractor.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 82b8d42b3..88aca8f46 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -94,7 +94,7 @@ async def download_and_process_file( ) logger.info(f"Processing {file_name} with Surfsense's file processor") - result = await process_file_in_background( + document = await process_file_in_background( file_path=temp_file_path, filename=file_name, search_space_id=search_space_id, @@ -104,8 +104,38 @@ async def download_and_process_file( 
log_entry=log_entry, ) + # Step 3: Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata + if document: + from app.db import DocumentType + + # Store original file type in metadata before changing document_type + original_type = document.document_type + + # Update document type to mark it as from Google Drive + document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR + + # Add Google Drive specific metadata + if not document.metadata: + document.metadata = {} + + document.metadata.update({ + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + "original_document_type": original_type, + "source_connector": "google_drive", + }) + + # If it was a Google Workspace file, note the export format + if is_google_workspace_file(mime_type): + document.metadata["exported_as"] = "pdf" + document.metadata["original_workspace_type"] = mime_type.split(".")[-1] # e.g., "document", "spreadsheet" + + await session.flush() # Persist the changes + logger.info(f"Updated document type to GOOGLE_DRIVE_CONNECTOR for {file_name}") + # process_file_in_background returns None on duplicate/error, Document on success - return result, None + return document, None except Exception as e: logger.warning(f"Failed to process {file_name}: {e!s}") From b2b891e4d746b0d2add1f7f3bf0fb6f341e9ee85 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 17:15:29 +0200 Subject: [PATCH 29/39] fix(connectors): properly commit Google Drive document type changes - Return file metadata from content_extractor for indexer to use - Update document type and metadata in indexer after processing - Explicitly commit changes to database - Ensures documents are properly marked as GOOGLE_DRIVE_CONNECTOR type --- .../google_drive/content_extractor.py | 55 +++++++------------ .../google_drive_indexer.py | 26 ++++++++- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 88aca8f46..005e7b0ae 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -29,7 +29,7 @@ async def download_and_process_file( session: AsyncSession, task_logger: TaskLoggingService, log_entry: Log, -) -> tuple[Any, str | None]: +) -> tuple[Any, str | None, dict[str, Any] | None]: """ Download Google Drive file and process using Surfsense's existing infrastructure. 
@@ -45,7 +45,7 @@ async def download_and_process_file( log_entry: Log entry for tracking Returns: - Tuple of (Document object if successful, error message if failed) + Tuple of (Document object if successful, error message if failed, file metadata dict) """ file_id = file.get("id") file_name = file.get("name", "Unknown") @@ -53,7 +53,7 @@ async def download_and_process_file( # Skip folders and shortcuts if should_skip_file(mime_type): - return None, f"Skipping {mime_type}" + return None, f"Skipping {mime_type}", None logger.info(f"Downloading file: {file_name} ({mime_type})") @@ -104,42 +104,27 @@ async def download_and_process_file( log_entry=log_entry, ) - # Step 3: Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata - if document: - from app.db import DocumentType - - # Store original file type in metadata before changing document_type - original_type = document.document_type - - # Update document type to mark it as from Google Drive - document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR - - # Add Google Drive specific metadata - if not document.metadata: - document.metadata = {} - - document.metadata.update({ - "google_drive_file_id": file_id, - "google_drive_file_name": file_name, - "google_drive_mime_type": mime_type, - "original_document_type": original_type, - "source_connector": "google_drive", - }) - - # If it was a Google Workspace file, note the export format - if is_google_workspace_file(mime_type): - document.metadata["exported_as"] = "pdf" - document.metadata["original_workspace_type"] = mime_type.split(".")[-1] # e.g., "document", "spreadsheet" - - await session.flush() # Persist the changes - logger.info(f"Updated document type to GOOGLE_DRIVE_CONNECTOR for {file_name}") - + # Note: Document type update happens in the indexer after this returns + # to ensure proper session management and commit timing + + # Prepare file metadata for the indexer to use + file_metadata = { + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + } + + # If it was a Google Workspace file, note the export format + if is_google_workspace_file(mime_type): + file_metadata["exported_as"] = "pdf" + file_metadata["original_workspace_type"] = mime_type.split(".")[-1] # e.g., "document", "spreadsheet" + # process_file_in_background returns None on duplicate/error, Document on success - return document, None + return document, None, file_metadata except Exception as e: logger.warning(f"Failed to process {file_name}: {e!s}") - return None, str(e) + return None, str(e), None finally: # Cleanup temp file (if process_file_in_background didn't already delete it) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 9c4d446de..9ed295424 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -388,7 +388,7 @@ async def _process_single_file( # Download and process using Surfsense's existing infrastructure # This handles: markdown, audio, PDFs, Office docs, images, etc. 
# It also handles: deduplication, chunking, summarization, embedding - document, error = await download_and_process_file( + document, error, file_metadata = await download_and_process_file( client=drive_client, file=file, search_space_id=search_space_id, @@ -407,7 +407,28 @@ async def _process_single_file( ) return 0, 1 - if document: + if document and file_metadata: + # Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata + original_type = document.document_type + document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR + + # Add Google Drive specific metadata + if not document.metadata: + document.metadata = {} + + document.metadata.update({ + **file_metadata, + "original_document_type": original_type, + "source_connector": "google_drive", + }) + + # Commit the document type and metadata changes + await session.commit() + + logger.info( + f"Updated document {document.id} to GOOGLE_DRIVE_CONNECTOR type with metadata" + ) + # Successfully indexed await task_logger.log_task_progress( log_entry, @@ -416,6 +437,7 @@ async def _process_single_file( "status": "indexed", "document_id": document.id, "file_name": file_name, + "document_type": DocumentType.GOOGLE_DRIVE_CONNECTOR, }, ) return 1, 0 From 8da58be9e01406161b99d73bc6521b0f45511f16 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 17:21:44 +0200 Subject: [PATCH 30/39] fix(connectors): refresh document from DB before updating type - Query document from database to ensure it's attached to session - Prevents detached instance errors after process_file_in_background commits - Properly updates document_type and metadata with session management --- .../connector_indexers/google_drive_indexer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 9ed295424..190792f1a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -408,6 +408,20 @@ async def _process_single_file( return 0, 1 if document and file_metadata: + # Refresh document from database to ensure it's attached to session + from app.db import Document + from sqlalchemy import select + + # Get fresh document from database + result = await session.execute( + select(Document).where(Document.id == document.id) + ) + document = result.scalar_one_or_none() + + if not document: + logger.error(f"Could not find document {document.id} in database") + return 0, 1 + # Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata original_type = document.document_type document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR From a5935bc6775d13e9c321e49a0ef6809012042f1a Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 18:01:39 +0200 Subject: [PATCH 31/39] feat(connectors): add connector parameter to file processor for source tracking - Add optional 'connector' parameter with 'type' and 'metadata' fields - Create helper function _update_document_from_connector - Use document_metadata column (not metadata) for JSON field - Merge metadata with existing using dict spread operator - Google Drive documents now marked as GOOGLE_DRIVE_CONNECTOR - Backward compatible - no changes to existing logic - Simple and clean implementation --- .../google_drive/content_extractor.py | 39 +++++++------ .../google_drive_indexer.py | 58 ++----------------- .../document_processors/file_processors.py | 34 
+++++++++++ 3 files changed, 60 insertions(+), 71 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 005e7b0ae..04c48f47f 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -92,9 +92,26 @@ async def download_and_process_file( from app.tasks.document_processors.file_processors import ( process_file_in_background, ) + from app.db import DocumentType + + # Prepare connector info + connector_info = { + "type": DocumentType.GOOGLE_DRIVE_CONNECTOR, + "metadata": { + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + "source_connector": "google_drive", + }, + } + + # If it was a Google Workspace file, note the export format + if is_google_workspace_file(mime_type): + connector_info["metadata"]["exported_as"] = "pdf" + connector_info["metadata"]["original_workspace_type"] = mime_type.split(".")[-1] logger.info(f"Processing {file_name} with Surfsense's file processor") - document = await process_file_in_background( + await process_file_in_background( file_path=temp_file_path, filename=file_name, search_space_id=search_space_id, @@ -102,25 +119,11 @@ async def download_and_process_file( session=session, task_logger=task_logger, log_entry=log_entry, + connector=connector_info, # Pass connector info ) - # Note: Document type update happens in the indexer after this returns - # to ensure proper session management and commit timing - - # Prepare file metadata for the indexer to use - file_metadata = { - "google_drive_file_id": file_id, - "google_drive_file_name": file_name, - "google_drive_mime_type": mime_type, - } - - # If it was a Google Workspace file, note the export format - if is_google_workspace_file(mime_type): - file_metadata["exported_as"] = "pdf" - file_metadata["original_workspace_type"] = mime_type.split(".")[-1] # e.g., "document", "spreadsheet" - - # process_file_in_background returns None on duplicate/error, Document on success - return document, None, file_metadata + # process_file_in_background doesn't return the document + return None, None, connector_info["metadata"] except Exception as e: logger.warning(f"Failed to process {file_name}: {e!s}") diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 190792f1a..a2899853e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -388,7 +388,8 @@ async def _process_single_file( # Download and process using Surfsense's existing infrastructure # This handles: markdown, audio, PDFs, Office docs, images, etc. 
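    # Sketch of the connector payload that process_file_in_background now applies via
    # _update_document_from_connector (shape as defined in content_extractor.py above;
    # the metadata values here are placeholders, not real Drive IDs):
    #
    #     connector_info = {
    #         "type": DocumentType.GOOGLE_DRIVE_CONNECTOR,
    #         "metadata": {
    #             "google_drive_file_id": "<drive-file-id>",
    #             "google_drive_file_name": "report.pdf",
    #             "google_drive_mime_type": "application/pdf",
    #             "source_connector": "google_drive",
    #         },
    #     }
    #     await process_file_in_background(..., connector=connector_info)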
# It also handles: deduplication, chunking, summarization, embedding - document, error, file_metadata = await download_and_process_file( + # Document type is set to GOOGLE_DRIVE_CONNECTOR during processing + _, error, _ = await download_and_process_file( client=drive_client, file=file, search_space_id=search_space_id, @@ -407,58 +408,9 @@ async def _process_single_file( ) return 0, 1 - if document and file_metadata: - # Refresh document from database to ensure it's attached to session - from app.db import Document - from sqlalchemy import select - - # Get fresh document from database - result = await session.execute( - select(Document).where(Document.id == document.id) - ) - document = result.scalar_one_or_none() - - if not document: - logger.error(f"Could not find document {document.id} in database") - return 0, 1 - - # Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata - original_type = document.document_type - document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR - - # Add Google Drive specific metadata - if not document.metadata: - document.metadata = {} - - document.metadata.update({ - **file_metadata, - "original_document_type": original_type, - "source_connector": "google_drive", - }) - - # Commit the document type and metadata changes - await session.commit() - - logger.info( - f"Updated document {document.id} to GOOGLE_DRIVE_CONNECTOR type with metadata" - ) - - # Successfully indexed - await task_logger.log_task_progress( - log_entry, - f"Successfully indexed: {file_name}", - { - "status": "indexed", - "document_id": document.id, - "file_name": file_name, - "document_type": DocumentType.GOOGLE_DRIVE_CONNECTOR, - }, - ) - return 1, 0 - else: - # Likely a duplicate or unsupported type - logger.info(f"No document created for {file_name} (duplicate or unsupported)") - return 0, 1 + # File was processed successfully (document type already set in processor) + logger.info(f"Successfully indexed Google Drive file: {file_name}") + return 1, 0 except Exception as e: logger.error(f"Error processing file {file_name}: {e!s}", exc_info=True) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index a32e75a32..61f484ae1 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -447,6 +447,24 @@ async def add_received_file_document_using_docling( ) from e +async def _update_document_from_connector( + document: Document | None, connector: dict | None, session: AsyncSession +) -> None: + """Helper to update document type and metadata from connector info.""" + if document and connector: + if "type" in connector: + document.document_type = connector["type"] + if "metadata" in connector: + # Merge with existing document_metadata (the actual column name) + if not document.document_metadata: + document.document_metadata = connector["metadata"] + else: + # Expand existing metadata with connector metadata + merged = {**document.document_metadata, **connector["metadata"]} + document.document_metadata = merged + await session.commit() + + async def process_file_in_background( file_path: str, filename: str, @@ -455,6 +473,7 @@ async def process_file_in_background( session: AsyncSession, task_logger: TaskLoggingService, log_entry: Log, + connector: dict | None = None, # Optional: {"type": "GOOGLE_DRIVE_CONNECTOR", "metadata": {...}} ): try: # Check if the file is a markdown or text file @@ -492,6 
+511,9 @@ async def process_file_in_background( session, filename, markdown_content, search_space_id, user_id ) + # Update from connector if provided + await _update_document_from_connector(result, connector, session) + if result: await task_logger.log_task_success( log_entry, @@ -608,6 +630,9 @@ async def process_file_in_background( session, filename, transcribed_text, search_space_id, user_id ) + # Update from connector if provided + await _update_document_from_connector(result, connector, session) + if result: await task_logger.log_task_success( log_entry, @@ -753,6 +778,9 @@ async def process_file_in_background( session, filename, docs, search_space_id, user_id ) + # Update from connector if provided + await _update_document_from_connector(result, connector, session) + if result: # Update page usage after successful processing # allow_exceed=True because document was already created after passing initial check @@ -897,6 +925,9 @@ async def process_file_in_background( user_id, final_page_count, allow_exceed=True ) + # Update from connector if provided + await _update_document_from_connector(last_created_doc, connector, session) + await task_logger.log_task_success( log_entry, f"Successfully processed file with LlamaCloud: {filename}", @@ -1021,6 +1052,9 @@ async def process_file_in_background( user_id, final_page_count, allow_exceed=True ) + # Update from connector if provided + await _update_document_from_connector(doc_result, connector, session) + await task_logger.log_task_success( log_entry, f"Successfully processed file with Docling: {filename}", From 506a9297a90c6fcf64a983a8b9d850c9398ad7dc Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 18:32:59 +0200 Subject: [PATCH 32/39] fix(connectors): track delta sync tokens per folder for Google Drive - Store tokens in folder_tokens dict instead of single global token - Each folder now tracks its own sync state independently - Fixes issue where indexing folder 2 incorrectly used delta sync after folder 1 was indexed - First-time indexing now correctly uses full scan for each new folder --- .../tasks/connector_indexers/google_drive_indexer.py | 10 +++++++--- surfsense_web/contracts/types/document.types.ts | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index a2899853e..335c3b41d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -112,8 +112,9 @@ async def index_google_drive_files( logger.info(f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})") - # Decide sync strategy - start_page_token = connector.config.get("start_page_token") + # Decide sync strategy - track tokens per folder + folder_tokens = connector.config.get("folder_tokens", {}) + start_page_token = folder_tokens.get(target_folder_id) can_use_delta_sync = use_delta_sync and start_page_token and connector.last_indexed_at if can_use_delta_sync: @@ -156,7 +157,10 @@ async def index_google_drive_files( if new_token and not token_error: from sqlalchemy.orm.attributes import flag_modified - connector.config["start_page_token"] = new_token + # Store token per folder + if "folder_tokens" not in connector.config: + connector.config["folder_tokens"] = {} + connector.config["folder_tokens"][target_folder_id] = new_token flag_modified(connector, "config") await 
update_connector_last_indexed(session, connector, update_last_indexed) diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 3ce5388dd..b2cdb79c3 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -15,6 +15,7 @@ export const documentTypeEnum = z.enum([ "CLICKUP_CONNECTOR", "GOOGLE_CALENDAR_CONNECTOR", "GOOGLE_GMAIL_CONNECTOR", + "GOOGLE_DRIVE_CONNECTOR", "AIRTABLE_CONNECTOR", "LUMA_CONNECTOR", "ELASTICSEARCH_CONNECTOR", From acf47e3b0cb6b4ba24defee4d38f07b10abad493 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 18:53:13 +0200 Subject: [PATCH 33/39] refactor(connectors): remove verbose docstrings and obvious comments - Simplify module docstrings (remove meta-commentary about 'small focused modules') - Remove redundant inline comments (e.g., 'Log task start', 'Get connector from database') - Trim verbose function docstrings to essential information only - Remove over-explanatory comments that restate what code does - Keep necessary documentation, remove noise for better readability --- .../app/connectors/google_drive/__init__.py | 6 +--- .../connectors/google_drive/change_tracker.py | 10 +----- .../app/connectors/google_drive/client.py | 15 ++------- .../google_drive/content_extractor.py | 20 ++---------- .../connectors/google_drive/credentials.py | 13 +------- .../app/connectors/google_drive/file_types.py | 9 +----- .../connectors/google_drive/folder_manager.py | 17 ++-------- .../google_drive_indexer.py | 32 +------------------ 8 files changed, 12 insertions(+), 110 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/__init__.py b/surfsense_backend/app/connectors/google_drive/__init__.py index c50135155..6e0d25725 100644 --- a/surfsense_backend/app/connectors/google_drive/__init__.py +++ b/surfsense_backend/app/connectors/google_drive/__init__.py @@ -1,8 +1,4 @@ -""" -Google Drive Connector Module. - -Simple, modular approach to Google Drive indexing. -""" +"""Google Drive Connector Module.""" from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token from .client import GoogleDriveClient diff --git a/surfsense_backend/app/connectors/google_drive/change_tracker.py b/surfsense_backend/app/connectors/google_drive/change_tracker.py index 1c697af5f..860e2dbef 100644 --- a/surfsense_backend/app/connectors/google_drive/change_tracker.py +++ b/surfsense_backend/app/connectors/google_drive/change_tracker.py @@ -1,9 +1,4 @@ -""" -Change Tracking for Google Drive - Delta Sync Support. - -Handles change detection and incremental syncing using Drive API's changes endpoint. -Small, focused module for tracking file modifications. 
-""" +"""Change tracking for Google Drive delta sync.""" import logging from datetime import datetime @@ -110,7 +105,6 @@ async def _filter_changes_by_folder( for change in changes: file = change.get("file") if not file: - # File was removed filtered.append(change) continue @@ -147,7 +141,6 @@ def categorize_change(change: dict[str, Any]) -> str: if file.get("trashed"): return "trashed" - # Check if file was recently created created_time = file.get("createdTime") modified_time = file.get("modifiedTime") @@ -198,7 +191,6 @@ async def fetch_all_changes( all_changes.extend(changes) - # If next_token is None, we've reached the end if not next_token or next_token == current_token: break diff --git a/surfsense_backend/app/connectors/google_drive/client.py b/surfsense_backend/app/connectors/google_drive/client.py index 6d2d0abfd..5053aa449 100644 --- a/surfsense_backend/app/connectors/google_drive/client.py +++ b/surfsense_backend/app/connectors/google_drive/client.py @@ -1,9 +1,4 @@ -""" -Google Drive API Client. - -Core client for interacting with Google Drive API. -Handles service initialization and basic file operations. -""" +"""Google Drive API client.""" from typing import Any @@ -16,12 +11,7 @@ from .credentials import get_valid_credentials class GoogleDriveClient: - """ - Main client for Google Drive API operations. - - Handles service initialization and provides methods for - listing files, getting metadata, and downloading content. - """ + """Client for Google Drive API operations.""" def __init__(self, session: AsyncSession, connector_id: int): """ @@ -140,7 +130,6 @@ class GoogleDriveClient: service = await self.get_service() request = service.files().get_media(fileId=file_id) - # Execute the download import io fh = io.BytesIO() diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 04c48f47f..00211957a 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -1,8 +1,4 @@ -""" -Content Extraction for Google Drive Files. - -Downloads files and delegates to Surfsense's existing file processors. -""" +"""Content extraction for Google Drive files.""" import logging import os @@ -31,9 +27,7 @@ async def download_and_process_file( log_entry: Log, ) -> tuple[Any, str | None, dict[str, Any] | None]: """ - Download Google Drive file and process using Surfsense's existing infrastructure. - - This is the ONLY function needed - it delegates everything to process_file_in_background. + Download Google Drive file and process using Surfsense file processors. Args: client: GoogleDriveClient instance @@ -71,10 +65,8 @@ async def download_and_process_file( if error: return None, error - # Set extension based on export format extension = ".pdf" if export_mime == "application/pdf" else ".txt" else: - # Regular files - download directly content_bytes, error = await client.download_file(file_id) if error: return None, error @@ -82,19 +74,15 @@ async def download_and_process_file( # Preserve original file extension extension = Path(file_name).suffix or ".bin" - # Save to temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: tmp_file.write(content_bytes) temp_file_path = tmp_file.name - # Step 2: Delegate to Surfsense's existing file processor - # This handles ALL file types: markdown, audio, PDFs, Office docs, images, etc. 
from app.tasks.document_processors.file_processors import ( process_file_in_background, ) from app.db import DocumentType - # Prepare connector info connector_info = { "type": DocumentType.GOOGLE_DRIVE_CONNECTOR, "metadata": { @@ -105,7 +93,6 @@ async def download_and_process_file( }, } - # If it was a Google Workspace file, note the export format if is_google_workspace_file(mime_type): connector_info["metadata"]["exported_as"] = "pdf" connector_info["metadata"]["original_workspace_type"] = mime_type.split(".")[-1] @@ -119,10 +106,9 @@ async def download_and_process_file( session=session, task_logger=task_logger, log_entry=log_entry, - connector=connector_info, # Pass connector info + connector=connector_info, ) - # process_file_in_background doesn't return the document return None, None, connector_info["metadata"] except Exception as e: diff --git a/surfsense_backend/app/connectors/google_drive/credentials.py b/surfsense_backend/app/connectors/google_drive/credentials.py index 5d09df881..4c1ef9c03 100644 --- a/surfsense_backend/app/connectors/google_drive/credentials.py +++ b/surfsense_backend/app/connectors/google_drive/credentials.py @@ -1,9 +1,4 @@ -""" -Google Drive OAuth Credentials Management. - -Handles credential validation, token refresh, and persistence to database. -Small, focused module for credential operations only. -""" +"""Google Drive OAuth credential management.""" import json from datetime import datetime @@ -35,7 +30,6 @@ async def get_valid_credentials( ValueError: If credentials are missing or invalid Exception: If token refresh fails """ - # Fetch connector from database result = await session.execute( select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id @@ -46,11 +40,9 @@ async def get_valid_credentials( if not connector: raise ValueError(f"Connector {connector_id} not found") - # Extract credentials from config config_data = connector.config exp = config_data.get("expiry", "").replace("Z", "") - # Validate required fields if not all( [ config_data.get("client_id"), @@ -62,7 +54,6 @@ async def get_valid_credentials( "Google OAuth credentials (client_id, client_secret, refresh_token) must be set" ) - # Create credentials object credentials = Credentials( token=config_data.get("token"), refresh_token=config_data.get("refresh_token"), @@ -73,12 +64,10 @@ async def get_valid_credentials( expiry=datetime.fromisoformat(exp) if exp else None, ) - # Refresh token if expired if credentials.expired or not credentials.valid: try: credentials.refresh(Request()) - # Persist refreshed token to database connector.config = json.loads(credentials.to_json()) flag_modified(connector, "config") await session.commit() diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index f66680c6c..cb2354585 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -1,18 +1,11 @@ -""" -File Type Handlers for Google Drive. +"""File type handlers for Google Drive.""" -Simple module for basic file type detection. 
-""" - -# Google Workspace MIME types that need export GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SLIDE = "application/vnd.google-apps.presentation" GOOGLE_FOLDER = "application/vnd.google-apps.folder" GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut" -# Export MIME types for Google Workspace files -# Export as PDF to preserve formatting, images, and structure EXPORT_FORMATS = { GOOGLE_DOC: "application/pdf", GOOGLE_SHEET: "application/pdf", diff --git a/surfsense_backend/app/connectors/google_drive/folder_manager.py b/surfsense_backend/app/connectors/google_drive/folder_manager.py index da9deb75d..599475a46 100644 --- a/surfsense_backend/app/connectors/google_drive/folder_manager.py +++ b/surfsense_backend/app/connectors/google_drive/folder_manager.py @@ -1,9 +1,4 @@ -""" -Folder Management for Google Drive. - -Handles folder listing, selection, and hierarchy operations. -Small, focused module for folder-related operations. -""" +"""Folder management for Google Drive.""" import logging from typing import Any @@ -165,11 +160,7 @@ async def list_folder_contents( parent_id: str | None = None, ) -> tuple[list[dict[str, Any]], str | None]: """ - List both folders and files in a Google Drive folder. - - Fetches ALL items using pagination (handles folders with >100 items). - Returns items sorted with folders first, then files. - Each item includes 'isFolder' boolean for frontend rendering. + List folders and files in a Google Drive folder with pagination support. Args: client: GoogleDriveClient instance @@ -212,20 +203,16 @@ async def list_folder_contents( all_items.extend(items) - # If no more pages, break if not next_token: break page_token = next_token - # Add 'isFolder' flag and sort (folders first, then files) for item in all_items: item["isFolder"] = item["mimeType"] == "application/vnd.google-apps.folder" - # Sort: folders first (alphabetically), then files (alphabetically) all_items.sort(key=lambda x: (not x["isFolder"], x["name"].lower())) - # Count folders and files for logging folder_count = sum(1 for item in all_items if item["isFolder"]) file_count = len(all_items) - folder_count diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 335c3b41d..cd862e372 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -1,11 +1,4 @@ -""" -Google Drive Indexer - Delegates all processing to Surfsense's file processors. 
- -Handles: -- Folder-specific indexing (user selects folder) -- Delta sync (only index changed files) -- Delegates file processing to process_file_in_background -""" +"""Google Drive indexer using Surfsense file processors.""" import logging from datetime import datetime @@ -63,7 +56,6 @@ async def index_google_drive_files( """ task_logger = TaskLoggingService(session, search_space_id) - # Log task start log_entry = await task_logger.log_task_start( task_name="google_drive_files_indexing", source="connector_indexing_task", @@ -78,7 +70,6 @@ async def index_google_drive_files( ) try: - # Get connector from database connector = await get_connector_by_id( session, connector_id, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR ) @@ -90,7 +81,6 @@ async def index_google_drive_files( ) return 0, error_msg - # Initialize Drive client await task_logger.log_task_progress( log_entry, f"Initializing Google Drive client for connector {connector_id}", @@ -99,7 +89,6 @@ async def index_google_drive_files( drive_client = GoogleDriveClient(session, connector_id) - # Use folder from request params (required for Google Drive) if not folder_id: error_msg = "folder_id is required for Google Drive indexing" await task_logger.log_task_failure( @@ -112,7 +101,6 @@ async def index_google_drive_files( logger.info(f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})") - # Decide sync strategy - track tokens per folder folder_tokens = connector.config.get("folder_tokens", {}) start_page_token = folder_tokens.get(target_folder_id) can_use_delta_sync = use_delta_sync and start_page_token and connector.last_indexed_at @@ -150,14 +138,11 @@ async def index_google_drive_files( documents_indexed, documents_skipped = result - # Update last indexed timestamp and get new start page token if documents_indexed > 0 or can_use_delta_sync: - # Get new start page token for next sync new_token, token_error = await get_start_page_token(drive_client) if new_token and not token_error: from sqlalchemy.orm.attributes import flag_modified - # Store token per folder if "folder_tokens" not in connector.config: connector.config["folder_tokens"] = {} connector.config["folder_tokens"][target_folder_id] = new_token @@ -165,13 +150,11 @@ async def index_google_drive_files( await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit await session.commit() logger.info( f"Successfully committed Google Drive indexing changes to database" ) - # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Google Drive indexing for connector {connector_id}", @@ -235,7 +218,6 @@ async def _index_full_scan( page_token = None files_processed = 0 - # Paginate through all files in folder while files_processed < max_files: files, next_token, error = await get_files_in_folder( drive_client, folder_id, include_subfolders=False, page_token=page_token @@ -254,7 +236,6 @@ async def _index_full_scan( files_processed += 1 - # Process file indexed, skipped = await _process_single_file( drive_client=drive_client, session=session, @@ -269,7 +250,6 @@ async def _index_full_scan( documents_indexed += indexed documents_skipped += skipped - # Batch commit every 10 files if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} files indexed so far") @@ -304,7 +284,6 @@ async def _index_with_delta_sync( {"stage": "delta_sync", "start_token": start_page_token}, ) - # Fetch all changes since last sync changes, final_token, 
error = await fetch_all_changes( drive_client, start_page_token, folder_id ) @@ -330,14 +309,12 @@ async def _index_with_delta_sync( files_processed += 1 change_type = categorize_change(change) - # Handle removed/trashed files if change_type in ["removed", "trashed"]: file_id = change.get("fileId") if file_id: await _remove_document(session, file_id, search_space_id) continue - # Handle modified/new files file = change.get("file") if not file: continue @@ -356,7 +333,6 @@ async def _index_with_delta_sync( documents_indexed += indexed documents_skipped += skipped - # Batch commit every 10 files if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} changes processed") @@ -389,10 +365,6 @@ async def _process_single_file( try: logger.info(f"Processing file: {file_name} ({mime_type})") - # Download and process using Surfsense's existing infrastructure - # This handles: markdown, audio, PDFs, Office docs, images, etc. - # It also handles: deduplication, chunking, summarization, embedding - # Document type is set to GOOGLE_DRIVE_CONNECTOR during processing _, error, _ = await download_and_process_file( client=drive_client, file=file, @@ -404,7 +376,6 @@ async def _process_single_file( ) if error: - # Log and skip - not an error, just unsupported or empty await task_logger.log_task_progress( log_entry, f"Skipped {file_name}: {error}", @@ -412,7 +383,6 @@ async def _process_single_file( ) return 0, 1 - # File was processed successfully (document type already set in processor) logger.info(f"Successfully indexed Google Drive file: {file_name}") return 1, 0 From 0b006de32dbfd0aba418920da65107acb2654db8 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 18:59:30 +0200 Subject: [PATCH 34/39] refactor(web): clean up Google Drive folder tree component - Replace inline comments with JSDoc multiline comments for main functions - Remove obvious/noisy inline comments from JSX - Simplify component documentation while keeping it clear - Improve readability by reducing comment clutter --- .../connectors/google-drive-folder-tree.tsx | 59 ++++++------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/surfsense_web/components/connectors/google-drive-folder-tree.tsx b/surfsense_web/components/connectors/google-drive-folder-tree.tsx index 793fdc750..05f4cc9e2 100644 --- a/surfsense_web/components/connectors/google-drive-folder-tree.tsx +++ b/surfsense_web/components/connectors/google-drive-folder-tree.tsx @@ -75,25 +75,23 @@ export function GoogleDriveFolderTree({ const [isLoadingRoot, setIsLoadingRoot] = useState(false); const [isInitialized, setIsInitialized] = useState(false); - // Helper to check if a folder is selected const isFolderSelected = (folderId: string): boolean => { return selectedFolders.some((f) => f.id === folderId); }; - // Handle folder checkbox toggle const toggleFolderSelection = (folderId: string, folderName: string) => { if (isFolderSelected(folderId)) { - // Remove from selection onSelectFolders(selectedFolders.filter((f) => f.id !== folderId)); } else { - // Add to selection onSelectFolders([...selectedFolders, { id: folderId, name: folderName }]); } }; - // Load root items (folders and files) on mount + /** + * Load root-level folders and files from Google Drive. 
+ */ const loadRootItems = async () => { - if (isInitialized) return; // Already loaded + if (isInitialized) return; setIsLoadingRoot(true); try { @@ -112,17 +110,16 @@ export function GoogleDriveFolderTree({ } }; - // Helper function to find an item recursively through all loaded items + /** + * Find an item by ID across all loaded items (root and nested). + */ const findItem = (itemId: string): DriveItem | undefined => { - // First check if we have it in itemStates const state = itemStates.get(itemId); if (state?.item) return state.item; - // Check root items const rootItem = rootItems.find((item) => item.id === itemId); if (rootItem) return rootItem; - // Recursively search through all loaded children for (const [, nodeState] of itemStates) { if (nodeState.children) { const found = nodeState.children.find((child) => child.id === itemId); @@ -133,17 +130,17 @@ export function GoogleDriveFolderTree({ return undefined; }; - // Load children (folders and files) for a specific folder + /** + * Load and display contents of a specific folder. + */ const loadFolderContents = async (folderId: string) => { try { - // Set loading state setItemStates((prev) => { const newMap = new Map(prev); const existing = newMap.get(folderId); if (existing) { newMap.set(folderId, { ...existing, isLoading: true }); } else { - // First time loading this folder - create initial state const item = findItem(folderId); if (item) { newMap.set(folderId, { @@ -165,10 +162,6 @@ export function GoogleDriveFolderTree({ const data = await response.json(); const items = data.items || []; - // Check if folder only contains files (no subfolders) - const hasSubfolders = items.some((item: DriveItem) => item.isFolder); - - // Update item state with loaded children setItemStates((prev) => { const newMap = new Map(prev); const existing = newMap.get(folderId); @@ -178,7 +171,7 @@ export function GoogleDriveFolderTree({ newMap.set(folderId, { item, children: items, - isExpanded: true, // Always expand after loading + isExpanded: true, isLoading: false, }); } else { @@ -188,7 +181,6 @@ export function GoogleDriveFolderTree({ }); } catch (error) { console.error("Error loading folder contents:", error); - // Clear loading state on error setItemStates((prev) => { const newMap = new Map(prev); const existing = newMap.get(folderId); @@ -200,17 +192,17 @@ export function GoogleDriveFolderTree({ } }; - // Toggle folder expansion + /** + * Toggle folder expand/collapse state. + */ const toggleFolder = async (item: DriveItem) => { - if (!item.isFolder) return; // Only folders can be expanded + if (!item.isFolder) return; const state = itemStates.get(item.id); if (!state || state.children === null) { - // First time expanding - load children await loadFolderContents(item.id); } else { - // Toggle expansion state setItemStates((prev) => { const newMap = new Map(prev); newMap.set(item.id, { @@ -222,7 +214,9 @@ export function GoogleDriveFolderTree({ } }; - // Recursive render function for item tree + /** + * Render a single item (folder or file) with its children. 
+ */ const renderItem = (item: DriveItem, level: number = 0) => { const state = itemStates.get(item.id); const isExpanded = state?.isExpanded || false; @@ -231,7 +225,6 @@ export function GoogleDriveFolderTree({ const isSelected = isFolderSelected(item.id); const isFolder = item.isFolder; - // Separate folders and files for children const childFolders = children?.filter((c) => c.isFolder) || []; const childFiles = children?.filter((c) => !c.isFolder) || []; @@ -245,7 +238,6 @@ export function GoogleDriveFolderTree({ isSelected && isFolder && "bg-accent/50" )} > - {/* Expand/Collapse Icon (only for folders) */} {isFolder ? ( ) : ( - // Empty space for alignment + )} - {/* Checkbox (only for folders) */} {isFolder && ( )} - {/* Icon */}
{isFolder ? ( isExpanded ? ( @@ -289,7 +279,6 @@ export function GoogleDriveFolderTree({ )}
- {/* Item Name */} isFolder && toggleFolder(item)} @@ -298,16 +287,11 @@ export function GoogleDriveFolderTree({ - {/* Render children if expanded (folders first, then files) */} {isExpanded && isFolder && children && (
- {/* Render folders first */} {childFolders.map((child) => renderItem(child, level + 1))} - - {/* Render files */} {childFiles.map((child) => renderItem(child, level + 1))} - {/* Empty state */} {children.length === 0 && (
Empty folder
)} @@ -317,7 +301,6 @@ export function GoogleDriveFolderTree({ ); }; - // Initialize on first render if (!isInitialized && !isLoadingRoot) { loadRootItems(); } @@ -326,7 +309,6 @@ export function GoogleDriveFolderTree({
- {/* My Drive Header (always visible, selectable) */}
- {/* Loading indicator */} {isLoadingRoot && (
)} - {/* Root items (folders and files) - same level as Google Drive shows */}
{!isLoadingRoot && rootItems.map((item) => renderItem(item, 0))}
- {/* Empty state */} {!isLoadingRoot && rootItems.length === 0 && (
No files or folders found in your Google Drive From 10c98745cdc3a2e7231d27dc8b05d1c9b6b609b8 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 19:17:37 +0200 Subject: [PATCH 35/39] refactor(web): use React Query for Google Drive folder operations - Fix errors in connectors-api.service (use .issues instead of .errors) - Create useGoogleDriveFolders hook with proper React Query integration - Add Google Drive folders cache keys with proper query invalidation - Refactor GoogleDriveFolderTree to use React Query hook for root data - Remove manual state management (isInitialized, setRootItems, loadRootItems) - Remove unused state (driveFolders, isLoadingFolders) from manage page - Simplify handleOpenDriveFolderDialog function - Automatic loading, caching, error handling, and refetching via React Query - Better performance with proper caching and state management --- .../connectors/(manage)/page.tsx | 68 ++++++------------- .../connectors/google-drive-folder-tree.tsx | 49 ++++--------- .../contracts/types/connector.types.ts | 32 +++++++++ .../hooks/use-google-drive-folders.ts | 29 ++++++++ .../lib/apis/connectors-api.service.ts | 40 +++++++++-- surfsense_web/lib/query-client/cache-keys.ts | 4 ++ 6 files changed, 129 insertions(+), 93 deletions(-) create mode 100644 surfsense_web/hooks/use-google-drive-folders.ts diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index 5854cb706..1e0e76ca9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -70,14 +70,8 @@ import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/comp import { EnumConnectorName } from "@/contracts/enums/connector"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import { cn } from "@/lib/utils"; -import { authenticatedFetch } from "@/lib/auth-utils"; import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; -interface DriveFolder { - id: string; - name: string; -} - export default function ConnectorsPage() { const t = useTranslations("connectors"); const tCommon = useTranslations("common"); @@ -127,9 +121,7 @@ export default function ConnectorsPage() { // Google Drive folder selection state const [driveFolderDialogOpen, setDriveFolderDialogOpen] = useState(false); - const [driveFolders, setDriveFolders] = useState([]); const [selectedFolders, setSelectedFolders] = useState>([]); - const [isLoadingFolders, setIsLoadingFolders] = useState(false); useEffect(() => { if (error) { @@ -165,31 +157,9 @@ export default function ConnectorsPage() { } }; - // Handle opening Google Drive folder selection dialog - const handleOpenDriveFolderDialog = async (connectorId: number) => { + const handleOpenDriveFolderDialog = (connectorId: number) => { setSelectedConnectorForIndexing(connectorId); setDriveFolderDialogOpen(true); - setIsLoadingFolders(true); - - try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/connectors/${connectorId}/google-drive/folders`, - { method: "GET" } - ); - - if (!response.ok) { - throw new Error("Failed to load folders"); - } - - const data = await response.json(); - setDriveFolders(data.folders || []); - } catch (error) { - console.error("Error loading folders:", error); - toast.error("Failed to load Google Drive folders"); - 
setDriveFolderDialogOpen(false); - } finally { - setIsLoadingFolders(false); - } }; // Handle Google Drive folder indexing @@ -204,15 +174,17 @@ export default function ConnectorsPage() { try { setIndexingConnectorId(selectedConnectorForIndexing); - // Call indexConnector with folder_ids and folder_names as query params - await indexConnector( - selectedConnectorForIndexing, - searchSpaceId, - undefined, - undefined, - selectedFolders.map((f) => f.id).join(","), - selectedFolders.map((f) => f.name).join(", ") - ); + const folderIds = selectedFolders.map((f) => f.id).join(","); + const folderNames = selectedFolders.map((f) => f.name).join(", "); + + await indexConnector({ + connector_id: selectedConnectorForIndexing, + queryParams: { + search_space_id: searchSpaceId, + folder_ids: folderIds, + folder_names: folderNames, + }, + }); toast.success(t("indexing_started")); } catch (error) { console.error("Error indexing connector content:", error); @@ -221,7 +193,6 @@ export default function ConnectorsPage() { setIndexingConnectorId(null); setSelectedConnectorForIndexing(null); setSelectedFolders([]); - setDriveFolders([]); } }; @@ -747,14 +718,13 @@ export default function ConnectorsPage() {