From 9f1fd20944d46a9475ec68b826addcfb3ce61f6c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 16:55:14 +0200 Subject: [PATCH] feat(connectors): mark Google Drive documents with GOOGLE_DRIVE_CONNECTOR type - Change document_type from file type (PDF, DOCX) to GOOGLE_DRIVE_CONNECTOR - Store original file type in metadata for reference - Add Google Drive specific metadata (file_id, mime_type, source) - Include export format info for Google Workspace files - Enables proper source tracking and bulk management --- .../google_drive/content_extractor.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 82b8d42b3..88aca8f46 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -94,7 +94,7 @@ async def download_and_process_file( ) logger.info(f"Processing {file_name} with Surfsense's file processor") - result = await process_file_in_background( + document = await process_file_in_background( file_path=temp_file_path, filename=file_name, search_space_id=search_space_id, @@ -104,8 +104,38 @@ async def download_and_process_file( log_entry=log_entry, ) + # Step 3: Update document type to GOOGLE_DRIVE_CONNECTOR and add metadata + if document: + from app.db import DocumentType + + # Store original file type in metadata before changing document_type + original_type = document.document_type + + # Update document type to mark it as from Google Drive + document.document_type = DocumentType.GOOGLE_DRIVE_CONNECTOR + + # Add Google Drive specific metadata + if not document.metadata: + document.metadata = {} + + document.metadata.update({ + "google_drive_file_id": file_id, + "google_drive_file_name": file_name, + "google_drive_mime_type": mime_type, + "original_document_type": original_type, + "source_connector": "google_drive", + }) + + # If it was a Google Workspace file, note the export format + if is_google_workspace_file(mime_type): + document.metadata["exported_as"] = "pdf" + document.metadata["original_workspace_type"] = mime_type.split(".")[-1] # e.g., "document", "spreadsheet" + + await session.flush() # Persist the changes + logger.info(f"Updated document type to GOOGLE_DRIVE_CONNECTOR for {file_name}") + # process_file_in_background returns None on duplicate/error, Document on success - return result, None + return document, None except Exception as e: logger.warning(f"Failed to process {file_name}: {e!s}")