feat: add file streaming download functionality to Google Drive client

- Introduced `download_file_to_disk` method to stream files directly to disk in chunks, reducing memory usage during downloads. - Updated `download_and_extract_content` function to utilize the new streaming download method for binary files, enhancing efficiency in handling large files. - Improved error handling for download operations, providing clearer feedback on failures.
2026-06-30 21:59:46 +02:00 · 2026-03-27 08:54:06 +05:30 · 2026-03-27 08:54:06 +05:30 · da6bbcfe39
commit da6bbcfe39
parent 7c7f8b216c
2 changed files with 38 additions and 8 deletions
--- a/surfsense_backend/app/connectors/google_drive/client.py
+++ b/surfsense_backend/app/connectors/google_drive/client.py
@ -172,6 +172,31 @@ class GoogleDriveClient:
        except Exception as e:
            return None, f"Error downloading file: {e!s}"
    async def download_file_to_disk(
        self, file_id: str, dest_path: str, chunksize: int = 5 * 1024 * 1024,
    ) -> str | None:
        """Stream file directly to disk in chunks, avoiding full in-memory buffering.
        Returns error message on failure, None on success.
        """
        try:
            service = await self.get_service()
            request = service.files().get_media(fileId=file_id)
            from googleapiclient.http import MediaIoBaseDownload
            with open(dest_path, "wb") as fh:
                downloader = MediaIoBaseDownload(fh, request, chunksize=chunksize)
                done = False
                while not done:
                    _, done = downloader.next_chunk()
            return None
        except HttpError as e:
            return f"HTTP error downloading file: {e.resp.status}"
        except Exception as e:
            return f"Error downloading file: {e!s}"
    async def export_google_file(
        self, file_id: str, mime_type: str
    ) -> tuple[bytes | None, str | None]:
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@ -60,8 +60,9 @@ async def download_and_extract_content(
    temp_file_path = None
    try:
        # Download / export
        if is_google_workspace_file(mime_type):
            # Workspace files (Docs/Sheets/Slides) use export -- returns bytes
            # in one shot. These are typically small (a few MB as PDF/text).
            export_mime = get_export_mime_type(mime_type)
            if not export_mime:
                return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
@ -69,17 +70,21 @@ async def download_and_extract_content(
            if error:
                return None, drive_metadata, error
            extension = ".pdf" if export_mime == "application/pdf" else ".txt"
        else:
            content_bytes, error = await client.download_file(file_id)
            if error:
                return None, drive_metadata, error
            extension = Path(file_name).suffix or ".bin"
            with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
                tmp.write(content_bytes)
                temp_file_path = tmp.name
        else:
            # Binary files -- stream directly to disk in chunks to avoid
            # loading the entire file into memory.
            extension = Path(file_name).suffix or ".bin"
            with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
                temp_file_path = tmp.name
            error = await client.download_file_to_disk(file_id, temp_file_path)
            if error:
                return None, drive_metadata, error
        # Parse to markdown
        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
        return markdown, drive_metadata, None