diff --git a/surfsense_backend/app/connectors/google_drive/client.py b/surfsense_backend/app/connectors/google_drive/client.py index 697e3e760..4e4240e91 100644 --- a/surfsense_backend/app/connectors/google_drive/client.py +++ b/surfsense_backend/app/connectors/google_drive/client.py @@ -172,6 +172,31 @@ class GoogleDriveClient: except Exception as e: return None, f"Error downloading file: {e!s}" + async def download_file_to_disk( + self, file_id: str, dest_path: str, chunksize: int = 5 * 1024 * 1024, + ) -> str | None: + """Stream file directly to disk in chunks, avoiding full in-memory buffering. + + Returns error message on failure, None on success. + """ + try: + service = await self.get_service() + request = service.files().get_media(fileId=file_id) + from googleapiclient.http import MediaIoBaseDownload + + with open(dest_path, "wb") as fh: + downloader = MediaIoBaseDownload(fh, request, chunksize=chunksize) + done = False + while not done: + _, done = downloader.next_chunk() + + return None + + except HttpError as e: + return f"HTTP error downloading file: {e.resp.status}" + except Exception as e: + return f"Error downloading file: {e!s}" + async def export_google_file( self, file_id: str, mime_type: str ) -> tuple[bytes | None, str | None]: diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 6fa20bf8e..69f64d9ae 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -60,8 +60,9 @@ async def download_and_extract_content( temp_file_path = None try: - # Download / export if is_google_workspace_file(mime_type): + # Workspace files (Docs/Sheets/Slides) use export -- returns bytes + # in one shot. These are typically small (a few MB as PDF/text). export_mime = get_export_mime_type(mime_type) if not export_mime: return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}" @@ -69,17 +70,21 @@ async def download_and_extract_content( if error: return None, drive_metadata, error extension = ".pdf" if export_mime == "application/pdf" else ".txt" + + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: + tmp.write(content_bytes) + temp_file_path = tmp.name else: - content_bytes, error = await client.download_file(file_id) + # Binary files -- stream directly to disk in chunks to avoid + # loading the entire file into memory. + extension = Path(file_name).suffix or ".bin" + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: + temp_file_path = tmp.name + + error = await client.download_file_to_disk(file_id, temp_file_path) if error: return None, drive_metadata, error - extension = Path(file_name).suffix or ".bin" - with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: - tmp.write(content_bytes) - temp_file_path = tmp.name - - # Parse to markdown markdown = await _parse_file_to_markdown(temp_file_path, file_name) return markdown, drive_metadata, None