feat: add file streaming download functionality to Google Drive client

- Introduced `download_file_to_disk` method to stream files directly to disk in chunks, reducing memory usage during downloads.
- Updated `download_and_extract_content` function to utilize the new streaming download method for binary files, enhancing efficiency in handling large files.
- Improved error handling for download operations, providing clearer feedback on failures.
This commit is contained in:
Anish Sarkar 2026-03-27 08:54:06 +05:30
parent 7c7f8b216c
commit da6bbcfe39
2 changed files with 38 additions and 8 deletions

View file

@ -172,6 +172,31 @@ class GoogleDriveClient:
except Exception as e: except Exception as e:
return None, f"Error downloading file: {e!s}" return None, f"Error downloading file: {e!s}"
async def download_file_to_disk(
    self, file_id: str, dest_path: str, chunksize: int = 5 * 1024 * 1024,
) -> str | None:
    """Download a Drive file's media content straight into a file on disk.

    The content is fetched through ``MediaIoBaseDownload`` in pieces of
    ``chunksize`` bytes and written to ``dest_path`` as it arrives, so the
    whole file is never held in memory at once.

    Args:
        file_id: Drive file ID passed to ``files().get_media``.
        dest_path: Path opened in ``"wb"`` mode; existing content is
            overwritten.
        chunksize: Chunk size handed to ``MediaIoBaseDownload``
            (defaults to 5 MiB).

    Returns:
        ``None`` on success, otherwise a human-readable error message.
        Exceptions are not propagated to the caller.

    Note:
        On failure, whatever was already written to ``dest_path`` is left
        in place; this method does not delete it.
    """
    try:
        drive = await self.get_service()
        media_request = drive.files().get_media(fileId=file_id)
        from googleapiclient.http import MediaIoBaseDownload

        with open(dest_path, "wb") as sink:
            chunk_reader = MediaIoBaseDownload(
                sink, media_request, chunksize=chunksize
            )
            # NOTE(review): next_chunk() is called without await, so it
            # appears to run synchronously inside this coroutine -- confirm
            # whether blocking the event loop per chunk is acceptable.
            while True:
                _progress, finished = chunk_reader.next_chunk()
                if finished:
                    break
    except HttpError as http_err:
        # Only the HTTP status code is surfaced for API-level failures.
        return f"HTTP error downloading file: {http_err.resp.status}"
    except Exception as exc:
        return f"Error downloading file: {exc!s}"
    return None
async def export_google_file( async def export_google_file(
self, file_id: str, mime_type: str self, file_id: str, mime_type: str
) -> tuple[bytes | None, str | None]: ) -> tuple[bytes | None, str | None]:

View file

@ -60,8 +60,9 @@ async def download_and_extract_content(
temp_file_path = None temp_file_path = None
try: try:
# Download / export
if is_google_workspace_file(mime_type): if is_google_workspace_file(mime_type):
# Workspace files (Docs/Sheets/Slides) use export -- returns bytes
# in one shot. These are typically small (a few MB as PDF/text).
export_mime = get_export_mime_type(mime_type) export_mime = get_export_mime_type(mime_type)
if not export_mime: if not export_mime:
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}" return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
@ -69,17 +70,21 @@ async def download_and_extract_content(
if error: if error:
return None, drive_metadata, error return None, drive_metadata, error
extension = ".pdf" if export_mime == "application/pdf" else ".txt" extension = ".pdf" if export_mime == "application/pdf" else ".txt"
else:
content_bytes, error = await client.download_file(file_id)
if error:
return None, drive_metadata, error
extension = Path(file_name).suffix or ".bin"
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
tmp.write(content_bytes) tmp.write(content_bytes)
temp_file_path = tmp.name temp_file_path = tmp.name
else:
# Binary files -- stream directly to disk in chunks to avoid
# loading the entire file into memory.
extension = Path(file_name).suffix or ".bin"
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
temp_file_path = tmp.name
error = await client.download_file_to_disk(file_id, temp_file_path)
if error:
return None, drive_metadata, error
# Parse to markdown
markdown = await _parse_file_to_markdown(temp_file_path, file_name) markdown = await _parse_file_to_markdown(temp_file_path, file_name)
return markdown, drive_metadata, None return markdown, drive_metadata, None