SurfSense/surfsense_backend/app/connectors/google_drive/client.py

"""Google Drive API client."""

import io
from typing import Any

from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseUpload
from sqlalchemy.ext.asyncio import AsyncSession

from .credentials import get_valid_credentials
from .file_types import GOOGLE_DOC, GOOGLE_SHEET


class GoogleDriveClient:
    """Client for Google Drive API operations."""

    def __init__(
        self,
        session: AsyncSession,
        connector_id: int,
        credentials: "Credentials | None" = None,
    ):
        """
        Initialize Google Drive client.

        Args:
            session: Database session
            connector_id: ID of the Drive connector
            credentials: Pre-built credentials (e.g. from Composio). If None,
                         credentials are loaded from the DB connector config.
        """
        self.session = session
        self.connector_id = connector_id
        self._credentials = credentials
        self.service = None

    async def get_service(self):
        """
        Get or create the Drive service instance.

        Returns:
            Google Drive service instance

        Raises:
            Exception: If service creation fails
        """
        if self.service:
            return self.service

        try:
            if self._credentials:
                credentials = self._credentials
            else:
                credentials = await get_valid_credentials(
                    self.session, self.connector_id
                )
            self.service = build("drive", "v3", credentials=credentials)
            return self.service
        except Exception as e:
            raise Exception(f"Failed to create Google Drive service: {e!s}") from e

    async def list_files(
        self,
        query: str = "",
        fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",
        page_size: int = 100,
        page_token: str | None = None,
    ) -> tuple[list[dict[str, Any]], str | None, str | None]:
        """
        List files from Google Drive with pagination.

        Args:
            query: Search query (e.g., "mimeType != 'application/vnd.google-apps.folder'")
            fields: Fields to retrieve
            page_size: Number of files per page (max 1000)
            page_token: Token for next page

        Returns:
            Tuple of (files list, next_page_token, error message)
        """
        try:
            service = await self.get_service()

            params = {
                "pageSize": min(page_size, 1000),
                "fields": fields,
                "supportsAllDrives": True,
                "includeItemsFromAllDrives": True,
            }

            if query:
                params["q"] = query
            if page_token:
                params["pageToken"] = page_token

            result = service.files().list(**params).execute()

            files = result.get("files", [])
            next_token = result.get("nextPageToken")

            return files, next_token, None

        except HttpError as e:
            error_msg = f"HTTP error listing files: {e.resp.status} - {e.error_details}"
            return [], None, error_msg
        except Exception as e:
            return [], None, f"Error listing files: {e!s}"

    async def get_file_metadata(
        self, file_id: str, fields: str = "*"
    ) -> tuple[dict[str, Any] | None, str | None]:
        """
        Get metadata for a specific file.

        Args:
            file_id: ID of the file
            fields: Fields to retrieve

        Returns:
            Tuple of (file metadata, error message)
        """
        try:
            service = await self.get_service()
            file = (
                service.files()
                .get(fileId=file_id, fields=fields, supportsAllDrives=True)
                .execute()
            )
            return file, None
        except HttpError as e:
            return None, f"HTTP error getting file metadata: {e.resp.status}"
        except Exception as e:
            return None, f"Error getting file metadata: {e!s}"

    async def download_file(self, file_id: str) -> tuple[bytes | None, str | None]:
        """
        Download binary file content.

        Args:
            file_id: ID of the file to download

        Returns:
            Tuple of (file content bytes, error message)
        """
        try:
            service = await self.get_service()
            request = service.files().get_media(fileId=file_id)

            import io

            fh = io.BytesIO()
            from googleapiclient.http import MediaIoBaseDownload

            downloader = MediaIoBaseDownload(fh, request)

            done = False
            while not done:
                _, done = downloader.next_chunk()

            return fh.getvalue(), None

        except HttpError as e:
            return None, f"HTTP error downloading file: {e.resp.status}"
        except Exception as e:
            return None, f"Error downloading file: {e!s}"

    async def export_google_file(
        self, file_id: str, mime_type: str
    ) -> tuple[bytes | None, str | None]:
        """
        Export Google Workspace file to specified format.

        Args:
            file_id: ID of the Google file
            mime_type: Target MIME type (e.g., 'application/pdf', 'text/plain')

        Returns:
            Tuple of (exported content as bytes, error message)
        """
        try:
            service = await self.get_service()
            content = (
                service.files().export(fileId=file_id, mimeType=mime_type).execute()
            )

            # Content is already bytes from the API
            # Keep as bytes to support both text and binary formats (like PDF)
            if not isinstance(content, bytes):
                content = content.encode("utf-8")

            return content, None

        except HttpError as e:
            return None, f"HTTP error exporting file: {e.resp.status}"
        except Exception as e:
            return None, f"Error exporting file: {e!s}"

    async def create_file(
        self,
        name: str,
        mime_type: str,
        parent_folder_id: str | None = None,
        content: str | None = None,
    ) -> dict[str, Any]:
        service = await self.get_service()

        body: dict[str, Any] = {"name": name, "mimeType": mime_type}
        if parent_folder_id:
            body["parents"] = [parent_folder_id]

        media: MediaIoBaseUpload | None = None
        if content:
            if mime_type == GOOGLE_DOC:
                import markdown as md_lib

                html = md_lib.markdown(content)
                media = MediaIoBaseUpload(
                    io.BytesIO(html.encode("utf-8")),
                    mimetype="text/html",
                    resumable=False,
                )
            elif mime_type == GOOGLE_SHEET:
                media = MediaIoBaseUpload(
                    io.BytesIO(content.encode("utf-8")),
                    mimetype="text/csv",
                    resumable=False,
                )

        if media:
            return (
                service.files()
                .create(
                    body=body,
                    media_body=media,
                    fields="id,name,mimeType,webViewLink",
                    supportsAllDrives=True,
                )
                .execute()
            )

        return (
            service.files()
            .create(
                body=body,
                fields="id,name,mimeType,webViewLink",
                supportsAllDrives=True,
            )
            .execute()
        )

    async def trash_file(self, file_id: str) -> bool:
        service = await self.get_service()
        service.files().update(
            fileId=file_id,
            body={"trashed": True},
            supportsAllDrives=True,
        ).execute()
        return True
refactor(connectors): remove verbose docstrings and obvious comments - Simplify module docstrings (remove meta-commentary about 'small focused modules') - Remove redundant inline comments (e.g., 'Log task start', 'Get connector from database') - Trim verbose function docstrings to essential information only - Remove over-explanatory comments that restate what code does - Keep necessary documentation, remove noise for better readability 2025-12-28 18:53:13 +02:00			`"""Google Drive API client."""`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00
add create_file and trash_file to GoogleDriveClient 2026-02-20 16:25:25 +02:00			`import io`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`from typing import Any`

chore: ran linting 2026-03-22 00:43:53 +05:30			`from google.oauth2.credentials import Credentials`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`from googleapiclient.discovery import build`
			`from googleapiclient.errors import HttpError`
add create_file and trash_file to GoogleDriveClient 2026-02-20 16:25:25 +02:00			`from googleapiclient.http import MediaIoBaseUpload`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`from sqlalchemy.ext.asyncio import AsyncSession`

			`from .credentials import get_valid_credentials`
add create_file and trash_file to GoogleDriveClient 2026-02-20 16:25:25 +02:00			`from .file_types import GOOGLE_DOC, GOOGLE_SHEET`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00

			`class GoogleDriveClient:`
refactor(connectors): remove verbose docstrings and obvious comments - Simplify module docstrings (remove meta-commentary about 'small focused modules') - Remove redundant inline comments (e.g., 'Log task start', 'Get connector from database') - Trim verbose function docstrings to essential information only - Remove over-explanatory comments that restate what code does - Keep necessary documentation, remove noise for better readability 2025-12-28 18:53:13 +02:00			`"""Client for Google Drive API operations."""`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00
refactor: unify all 3 google Composio and non-Composio connector types and pipelines keeping same credential adapters 2026-03-19 05:08:21 +05:30			`def __init__(`
			`self,`
			`session: AsyncSession,`
			`connector_id: int,`
			`credentials: "Credentials \| None" = None,`
			`):`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`"""`
			`Initialize Google Drive client.`

			`Args:`
			`session: Database session`
			`connector_id: ID of the Drive connector`
refactor: unify all 3 google Composio and non-Composio connector types and pipelines keeping same credential adapters 2026-03-19 05:08:21 +05:30			`credentials: Pre-built credentials (e.g. from Composio). If None,`
			`credentials are loaded from the DB connector config.`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`"""`
			`self.session = session`
			`self.connector_id = connector_id`
refactor: unify all 3 google Composio and non-Composio connector types and pipelines keeping same credential adapters 2026-03-19 05:08:21 +05:30			`self._credentials = credentials`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`self.service = None`

			`async def get_service(self):`
			`"""`
			`Get or create the Drive service instance.`

			`Returns:`
			`Google Drive service instance`

			`Raises:`
			`Exception: If service creation fails`
			`"""`
			`if self.service:`
			`return self.service`

			`try:`
refactor: unify all 3 google Composio and non-Composio connector types and pipelines keeping same credential adapters 2026-03-19 05:08:21 +05:30			`if self._credentials:`
			`credentials = self._credentials`
			`else:`
			`credentials = await get_valid_credentials(`
			`self.session, self.connector_id`
			`)`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`self.service = build("drive", "v3", credentials=credentials)`
			`return self.service`
			`except Exception as e:`
			`raise Exception(f"Failed to create Google Drive service: {e!s}") from e`

			`async def list_files(`
			`self,`
			`query: str = "",`
feat: enhance Google Drive file metadata handling - Updated Google Drive API calls to include md5Checksum in file metadata retrieval for improved content tracking. - Added logic to check for rename-only updates based on md5Checksum, optimizing document processing by preventing unnecessary ETL operations for unchanged content. - Enhanced existing document update logic to handle renaming and metadata updates more effectively, particularly for Google Drive files. 2026-01-17 16:24:53 +05:30			`fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`page_size: int = 100,`
			`page_token: str \| None = None,`
			`) -> tuple[list[dict[str, Any]], str \| None, str \| None]:`
			`"""`
			`List files from Google Drive with pagination.`

			`Args:`
			`query: Search query (e.g., "mimeType != 'application/vnd.google-apps.folder'")`
			`fields: Fields to retrieve`
			`page_size: Number of files per page (max 1000)`
			`page_token: Token for next page`

			`Returns:`
			`Tuple of (files list, next_page_token, error message)`
			`"""`
			`try:`
			`service = await self.get_service()`

			`params = {`
			`"pageSize": min(page_size, 1000),`
			`"fields": fields,`
			`"supportsAllDrives": True,`
			`"includeItemsFromAllDrives": True,`
			`}`

			`if query:`
			`params["q"] = query`
			`if page_token:`
			`params["pageToken"] = page_token`

			`result = service.files().list(**params).execute()`

			`files = result.get("files", [])`
			`next_token = result.get("nextPageToken")`

			`return files, next_token, None`

			`except HttpError as e:`
			`error_msg = f"HTTP error listing files: {e.resp.status} - {e.error_details}"`
			`return [], None, error_msg`
			`except Exception as e:`
			`return [], None, f"Error listing files: {e!s}"`

			`async def get_file_metadata(`
			`self, file_id: str, fields: str = "*"`
			`) -> tuple[dict[str, Any] \| None, str \| None]:`
			`"""`
			`Get metadata for a specific file.`

			`Args:`
			`file_id: ID of the file`
			`fields: Fields to retrieve`

			`Returns:`
			`Tuple of (file metadata, error message)`
			`"""`
			`try:`
			`service = await self.get_service()`
$DESKTOP-RTLN3BA\$punk$ feat: added circleback connector 2025-12-30 09:00:59 -08:00			`file = (`
			`service.files()`
			`.get(fileId=file_id, fields=fields, supportsAllDrives=True)`
			`.execute()`
			`)`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`return file, None`
			`except HttpError as e:`
			`return None, f"HTTP error getting file metadata: {e.resp.status}"`
			`except Exception as e:`
			`return None, f"Error getting file metadata: {e!s}"`

$DESKTOP-RTLN3BA\$punk$ feat: added circleback connector 2025-12-30 09:00:59 -08:00			`async def download_file(self, file_id: str) -> tuple[bytes \| None, str \| None]:`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`"""`
			`Download binary file content.`

			`Args:`
			`file_id: ID of the file to download`

			`Returns:`
			`Tuple of (file content bytes, error message)`
			`"""`
			`try:`
			`service = await self.get_service()`
			`request = service.files().get_media(fileId=file_id)`

			`import io`

			`fh = io.BytesIO()`
			`from googleapiclient.http import MediaIoBaseDownload`

			`downloader = MediaIoBaseDownload(fh, request)`

			`done = False`
			`while not done:`
			`_, done = downloader.next_chunk()`

			`return fh.getvalue(), None`

			`except HttpError as e:`
			`return None, f"HTTP error downloading file: {e.resp.status}"`
			`except Exception as e:`
			`return None, f"Error downloading file: {e!s}"`

			`async def export_google_file(`
			`self, file_id: str, mime_type: str`
			`) -> tuple[bytes \| None, str \| None]:`
			`"""`
			`Export Google Workspace file to specified format.`

			`Args:`
			`file_id: ID of the Google file`
			`mime_type: Target MIME type (e.g., 'application/pdf', 'text/plain')`

			`Returns:`
			`Tuple of (exported content as bytes, error message)`
			`"""`
			`try:`
			`service = await self.get_service()`
			`content = (`
$DESKTOP-RTLN3BA\$punk$ feat: added circleback connector 2025-12-30 09:00:59 -08:00			`service.files().export(fileId=file_id, mimeType=mime_type).execute()`
feat(connectors): add Google Drive API client wrapper - Build and manage Google Drive service with credentials - List files with query support and pagination - Download binary files and export Google Workspace files as PDF - Handle HTTP errors gracefully 2025-12-28 15:54:32 +02:00			`)`

			`# Content is already bytes from the API`
			`# Keep as bytes to support both text and binary formats (like PDF)`
			`if not isinstance(content, bytes):`
			`content = content.encode("utf-8")`

			`return content, None`

			`except HttpError as e:`
			`return None, f"HTTP error exporting file: {e.resp.status}"`
			`except Exception as e:`
			`return None, f"Error exporting file: {e!s}"`
add create_file and trash_file to GoogleDriveClient 2026-02-20 16:25:25 +02:00
			`async def create_file(`
			`self,`
			`name: str,`
			`mime_type: str,`
			`parent_folder_id: str \| None = None,`
			`content: str \| None = None,`
			`) -> dict[str, Any]:`
			`service = await self.get_service()`

			`body: dict[str, Any] = {"name": name, "mimeType": mime_type}`
			`if parent_folder_id:`
			`body["parents"] = [parent_folder_id]`

			`media: MediaIoBaseUpload \| None = None`
			`if content:`
			`if mime_type == GOOGLE_DOC:`
			`import markdown as md_lib`

			`html = md_lib.markdown(content)`
			`media = MediaIoBaseUpload(`
			`io.BytesIO(html.encode("utf-8")),`
			`mimetype="text/html",`
			`resumable=False,`
			`)`
			`elif mime_type == GOOGLE_SHEET:`
			`media = MediaIoBaseUpload(`
			`io.BytesIO(content.encode("utf-8")),`
			`mimetype="text/csv",`
			`resumable=False,`
			`)`

			`if media:`
			`return (`
			`service.files()`
			`.create(`
			`body=body,`
			`media_body=media,`
			`fields="id,name,mimeType,webViewLink",`
			`supportsAllDrives=True,`
			`)`
			`.execute()`
			`)`

			`return (`
			`service.files()`
			`.create(`
			`body=body,`
			`fields="id,name,mimeType,webViewLink",`
			`supportsAllDrives=True,`
			`)`
			`.execute()`
			`)`

			`async def trash_file(self, file_id: str) -> bool:`
			`service = await self.get_service()`
			`service.files().update(`
			`fileId=file_id,`
			`body={"trashed": True},`
			`supportsAllDrives=True,`
			`).execute()`
			`return True`