mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 17:56:25 +02:00
feat: add file streaming download functionality to Google Drive client
- Introduce the `download_file_to_disk` method, which streams files directly to disk in chunks to reduce memory usage during downloads.
- Update the `download_and_extract_content` function to use the new streaming download method for binary files, making large-file handling more efficient.
- Improve error handling for download operations so that failures produce clearer feedback.
This commit is contained in:
parent
7c7f8b216c
commit
da6bbcfe39
2 changed files with 38 additions and 8 deletions
|
|
@ -172,6 +172,31 @@ class GoogleDriveClient:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None, f"Error downloading file: {e!s}"
|
return None, f"Error downloading file: {e!s}"
|
||||||
|
|
||||||
|
async def download_file_to_disk(
    self,
    file_id: str,
    dest_path: str,
    chunksize: int = 5 * 1024 * 1024,
) -> str | None:
    """Download a Drive file's media content straight into a local file.

    The content is fetched in pieces of ``chunksize`` bytes and each piece
    is written to ``dest_path`` as it arrives, so the full file is never
    buffered in memory.

    Args:
        file_id: Identifier passed as ``fileId`` to ``files().get_media``.
        dest_path: Path of the file to write; opened in ``"wb"`` mode, so
            any existing content is truncated.
        chunksize: Chunk size in bytes handed to ``MediaIoBaseDownload``
            (defaults to 5 MiB).

    Returns:
        ``None`` on success, otherwise a human-readable error message.
        Exceptions are not propagated to the caller.
    """
    try:
        drive = await self.get_service()
        media_request = drive.files().get_media(fileId=file_id)
        from googleapiclient.http import MediaIoBaseDownload

        with open(dest_path, "wb") as out_file:
            downloader = MediaIoBaseDownload(
                out_file, media_request, chunksize=chunksize
            )
            # NOTE(review): next_chunk() is called without ``await``, so each
            # chunk request runs synchronously inside this coroutine and the
            # event loop cannot make progress meanwhile. Offloading to a
            # worker thread would need a transport that is safe to use off
            # the loop thread -- confirm before changing.
            while True:
                _, finished = downloader.next_chunk()
                if finished:
                    break
    except HttpError as http_err:
        return f"HTTP error downloading file: {http_err.resp.status}"
    except Exception as exc:
        return f"Error downloading file: {exc!s}"
    else:
        return None
|
||||||
|
|
||||||
async def export_google_file(
|
async def export_google_file(
|
||||||
self, file_id: str, mime_type: str
|
self, file_id: str, mime_type: str
|
||||||
) -> tuple[bytes | None, str | None]:
|
) -> tuple[bytes | None, str | None]:
|
||||||
|
|
|
||||||
|
|
@ -60,8 +60,9 @@ async def download_and_extract_content(
|
||||||
|
|
||||||
temp_file_path = None
|
temp_file_path = None
|
||||||
try:
|
try:
|
||||||
# Download / export
|
|
||||||
if is_google_workspace_file(mime_type):
|
if is_google_workspace_file(mime_type):
|
||||||
|
# Workspace files (Docs/Sheets/Slides) use export -- returns bytes
|
||||||
|
# in one shot. These are typically small (a few MB as PDF/text).
|
||||||
export_mime = get_export_mime_type(mime_type)
|
export_mime = get_export_mime_type(mime_type)
|
||||||
if not export_mime:
|
if not export_mime:
|
||||||
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
|
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
|
||||||
|
|
@ -69,17 +70,21 @@ async def download_and_extract_content(
|
||||||
if error:
|
if error:
|
||||||
return None, drive_metadata, error
|
return None, drive_metadata, error
|
||||||
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
|
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
|
||||||
else:
|
|
||||||
content_bytes, error = await client.download_file(file_id)
|
|
||||||
if error:
|
|
||||||
return None, drive_metadata, error
|
|
||||||
extension = Path(file_name).suffix or ".bin"
|
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
|
||||||
tmp.write(content_bytes)
|
tmp.write(content_bytes)
|
||||||
temp_file_path = tmp.name
|
temp_file_path = tmp.name
|
||||||
|
else:
|
||||||
|
# Binary files -- stream directly to disk in chunks to avoid
|
||||||
|
# loading the entire file into memory.
|
||||||
|
extension = Path(file_name).suffix or ".bin"
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
|
||||||
|
temp_file_path = tmp.name
|
||||||
|
|
||||||
|
error = await client.download_file_to_disk(file_id, temp_file_path)
|
||||||
|
if error:
|
||||||
|
return None, drive_metadata, error
|
||||||
|
|
||||||
# Parse to markdown
|
|
||||||
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
|
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
|
||||||
return markdown, drive_metadata, None
|
return markdown, drive_metadata, None
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue