From acf47e3b0cb6b4ba24defee4d38f07b10abad493 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Sun, 28 Dec 2025 18:53:13 +0200 Subject: [PATCH] refactor(connectors): remove verbose docstrings and obvious comments - Simplify module docstrings (remove meta-commentary about 'small focused modules') - Remove redundant inline comments (e.g., 'Log task start', 'Get connector from database') - Trim verbose function docstrings to essential information only - Remove over-explanatory comments that restate what code does - Keep necessary documentation, remove noise for better readability --- .../app/connectors/google_drive/__init__.py | 6 +--- .../connectors/google_drive/change_tracker.py | 10 +----- .../app/connectors/google_drive/client.py | 15 ++------- .../google_drive/content_extractor.py | 20 ++---------- .../connectors/google_drive/credentials.py | 13 +------- .../app/connectors/google_drive/file_types.py | 9 +----- .../connectors/google_drive/folder_manager.py | 17 ++-------- .../google_drive_indexer.py | 32 +------------------ 8 files changed, 12 insertions(+), 110 deletions(-) diff --git a/surfsense_backend/app/connectors/google_drive/__init__.py b/surfsense_backend/app/connectors/google_drive/__init__.py index c50135155..6e0d25725 100644 --- a/surfsense_backend/app/connectors/google_drive/__init__.py +++ b/surfsense_backend/app/connectors/google_drive/__init__.py @@ -1,8 +1,4 @@ -""" -Google Drive Connector Module. - -Simple, modular approach to Google Drive indexing. -""" +"""Google Drive Connector Module.""" from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token from .client import GoogleDriveClient diff --git a/surfsense_backend/app/connectors/google_drive/change_tracker.py b/surfsense_backend/app/connectors/google_drive/change_tracker.py index 1c697af5f..860e2dbef 100644 --- a/surfsense_backend/app/connectors/google_drive/change_tracker.py +++ b/surfsense_backend/app/connectors/google_drive/change_tracker.py @@ -1,9 +1,4 @@ -""" -Change Tracking for Google Drive - Delta Sync Support. - -Handles change detection and incremental syncing using Drive API's changes endpoint. -Small, focused module for tracking file modifications. -""" +"""Change tracking for Google Drive delta sync.""" import logging from datetime import datetime @@ -110,7 +105,6 @@ async def _filter_changes_by_folder( for change in changes: file = change.get("file") if not file: - # File was removed filtered.append(change) continue @@ -147,7 +141,6 @@ def categorize_change(change: dict[str, Any]) -> str: if file.get("trashed"): return "trashed" - # Check if file was recently created created_time = file.get("createdTime") modified_time = file.get("modifiedTime") @@ -198,7 +191,6 @@ async def fetch_all_changes( all_changes.extend(changes) - # If next_token is None, we've reached the end if not next_token or next_token == current_token: break diff --git a/surfsense_backend/app/connectors/google_drive/client.py b/surfsense_backend/app/connectors/google_drive/client.py index 6d2d0abfd..5053aa449 100644 --- a/surfsense_backend/app/connectors/google_drive/client.py +++ b/surfsense_backend/app/connectors/google_drive/client.py @@ -1,9 +1,4 @@ -""" -Google Drive API Client. - -Core client for interacting with Google Drive API. -Handles service initialization and basic file operations. -""" +"""Google Drive API client.""" from typing import Any @@ -16,12 +11,7 @@ from .credentials import get_valid_credentials class GoogleDriveClient: - """ - Main client for Google Drive API operations. - - Handles service initialization and provides methods for - listing files, getting metadata, and downloading content. - """ + """Client for Google Drive API operations.""" def __init__(self, session: AsyncSession, connector_id: int): """ @@ -140,7 +130,6 @@ class GoogleDriveClient: service = await self.get_service() request = service.files().get_media(fileId=file_id) - # Execute the download import io fh = io.BytesIO() diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 04c48f47f..00211957a 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -1,8 +1,4 @@ -""" -Content Extraction for Google Drive Files. - -Downloads files and delegates to Surfsense's existing file processors. -""" +"""Content extraction for Google Drive files.""" import logging import os @@ -31,9 +27,7 @@ async def download_and_process_file( log_entry: Log, ) -> tuple[Any, str | None, dict[str, Any] | None]: """ - Download Google Drive file and process using Surfsense's existing infrastructure. - - This is the ONLY function needed - it delegates everything to process_file_in_background. + Download Google Drive file and process using Surfsense file processors. Args: client: GoogleDriveClient instance @@ -71,10 +65,8 @@ async def download_and_process_file( if error: return None, error - # Set extension based on export format extension = ".pdf" if export_mime == "application/pdf" else ".txt" else: - # Regular files - download directly content_bytes, error = await client.download_file(file_id) if error: return None, error @@ -82,19 +74,15 @@ async def download_and_process_file( # Preserve original file extension extension = Path(file_name).suffix or ".bin" - # Save to temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: tmp_file.write(content_bytes) temp_file_path = tmp_file.name - # Step 2: Delegate to Surfsense's existing file processor - # This handles ALL file types: markdown, audio, PDFs, Office docs, images, etc. from app.tasks.document_processors.file_processors import ( process_file_in_background, ) from app.db import DocumentType - # Prepare connector info connector_info = { "type": DocumentType.GOOGLE_DRIVE_CONNECTOR, "metadata": { @@ -105,7 +93,6 @@ async def download_and_process_file( }, } - # If it was a Google Workspace file, note the export format if is_google_workspace_file(mime_type): connector_info["metadata"]["exported_as"] = "pdf" connector_info["metadata"]["original_workspace_type"] = mime_type.split(".")[-1] @@ -119,10 +106,9 @@ async def download_and_process_file( session=session, task_logger=task_logger, log_entry=log_entry, - connector=connector_info, # Pass connector info + connector=connector_info, ) - # process_file_in_background doesn't return the document return None, None, connector_info["metadata"] except Exception as e: diff --git a/surfsense_backend/app/connectors/google_drive/credentials.py b/surfsense_backend/app/connectors/google_drive/credentials.py index 5d09df881..4c1ef9c03 100644 --- a/surfsense_backend/app/connectors/google_drive/credentials.py +++ b/surfsense_backend/app/connectors/google_drive/credentials.py @@ -1,9 +1,4 @@ -""" -Google Drive OAuth Credentials Management. - -Handles credential validation, token refresh, and persistence to database. -Small, focused module for credential operations only. -""" +"""Google Drive OAuth credential management.""" import json from datetime import datetime @@ -35,7 +30,6 @@ async def get_valid_credentials( ValueError: If credentials are missing or invalid Exception: If token refresh fails """ - # Fetch connector from database result = await session.execute( select(SearchSourceConnector).filter( SearchSourceConnector.id == connector_id @@ -46,11 +40,9 @@ async def get_valid_credentials( if not connector: raise ValueError(f"Connector {connector_id} not found") - # Extract credentials from config config_data = connector.config exp = config_data.get("expiry", "").replace("Z", "") - # Validate required fields if not all( [ config_data.get("client_id"), @@ -62,7 +54,6 @@ async def get_valid_credentials( "Google OAuth credentials (client_id, client_secret, refresh_token) must be set" ) - # Create credentials object credentials = Credentials( token=config_data.get("token"), refresh_token=config_data.get("refresh_token"), @@ -73,12 +64,10 @@ async def get_valid_credentials( expiry=datetime.fromisoformat(exp) if exp else None, ) - # Refresh token if expired if credentials.expired or not credentials.valid: try: credentials.refresh(Request()) - # Persist refreshed token to database connector.config = json.loads(credentials.to_json()) flag_modified(connector, "config") await session.commit() diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index f66680c6c..cb2354585 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -1,18 +1,11 @@ -""" -File Type Handlers for Google Drive. +"""File type handlers for Google Drive.""" -Simple module for basic file type detection. -""" - -# Google Workspace MIME types that need export GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SLIDE = "application/vnd.google-apps.presentation" GOOGLE_FOLDER = "application/vnd.google-apps.folder" GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut" -# Export MIME types for Google Workspace files -# Export as PDF to preserve formatting, images, and structure EXPORT_FORMATS = { GOOGLE_DOC: "application/pdf", GOOGLE_SHEET: "application/pdf", diff --git a/surfsense_backend/app/connectors/google_drive/folder_manager.py b/surfsense_backend/app/connectors/google_drive/folder_manager.py index da9deb75d..599475a46 100644 --- a/surfsense_backend/app/connectors/google_drive/folder_manager.py +++ b/surfsense_backend/app/connectors/google_drive/folder_manager.py @@ -1,9 +1,4 @@ -""" -Folder Management for Google Drive. - -Handles folder listing, selection, and hierarchy operations. -Small, focused module for folder-related operations. -""" +"""Folder management for Google Drive.""" import logging from typing import Any @@ -165,11 +160,7 @@ async def list_folder_contents( parent_id: str | None = None, ) -> tuple[list[dict[str, Any]], str | None]: """ - List both folders and files in a Google Drive folder. - - Fetches ALL items using pagination (handles folders with >100 items). - Returns items sorted with folders first, then files. - Each item includes 'isFolder' boolean for frontend rendering. + List folders and files in a Google Drive folder with pagination support. Args: client: GoogleDriveClient instance @@ -212,20 +203,16 @@ async def list_folder_contents( all_items.extend(items) - # If no more pages, break if not next_token: break page_token = next_token - # Add 'isFolder' flag and sort (folders first, then files) for item in all_items: item["isFolder"] = item["mimeType"] == "application/vnd.google-apps.folder" - # Sort: folders first (alphabetically), then files (alphabetically) all_items.sort(key=lambda x: (not x["isFolder"], x["name"].lower())) - # Count folders and files for logging folder_count = sum(1 for item in all_items if item["isFolder"]) file_count = len(all_items) - folder_count diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 335c3b41d..cd862e372 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -1,11 +1,4 @@ -""" -Google Drive Indexer - Delegates all processing to Surfsense's file processors. - -Handles: -- Folder-specific indexing (user selects folder) -- Delta sync (only index changed files) -- Delegates file processing to process_file_in_background -""" +"""Google Drive indexer using Surfsense file processors.""" import logging from datetime import datetime @@ -63,7 +56,6 @@ async def index_google_drive_files( """ task_logger = TaskLoggingService(session, search_space_id) - # Log task start log_entry = await task_logger.log_task_start( task_name="google_drive_files_indexing", source="connector_indexing_task", @@ -78,7 +70,6 @@ async def index_google_drive_files( ) try: - # Get connector from database connector = await get_connector_by_id( session, connector_id, SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR ) @@ -90,7 +81,6 @@ async def index_google_drive_files( ) return 0, error_msg - # Initialize Drive client await task_logger.log_task_progress( log_entry, f"Initializing Google Drive client for connector {connector_id}", @@ -99,7 +89,6 @@ async def index_google_drive_files( drive_client = GoogleDriveClient(session, connector_id) - # Use folder from request params (required for Google Drive) if not folder_id: error_msg = "folder_id is required for Google Drive indexing" await task_logger.log_task_failure( @@ -112,7 +101,6 @@ async def index_google_drive_files( logger.info(f"Indexing Google Drive folder: {target_folder_name} ({target_folder_id})") - # Decide sync strategy - track tokens per folder folder_tokens = connector.config.get("folder_tokens", {}) start_page_token = folder_tokens.get(target_folder_id) can_use_delta_sync = use_delta_sync and start_page_token and connector.last_indexed_at @@ -150,14 +138,11 @@ async def index_google_drive_files( documents_indexed, documents_skipped = result - # Update last indexed timestamp and get new start page token if documents_indexed > 0 or can_use_delta_sync: - # Get new start page token for next sync new_token, token_error = await get_start_page_token(drive_client) if new_token and not token_error: from sqlalchemy.orm.attributes import flag_modified - # Store token per folder if "folder_tokens" not in connector.config: connector.config["folder_tokens"] = {} connector.config["folder_tokens"][target_folder_id] = new_token @@ -165,13 +150,11 @@ async def index_google_drive_files( await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit await session.commit() logger.info( f"Successfully committed Google Drive indexing changes to database" ) - # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Google Drive indexing for connector {connector_id}", @@ -235,7 +218,6 @@ async def _index_full_scan( page_token = None files_processed = 0 - # Paginate through all files in folder while files_processed < max_files: files, next_token, error = await get_files_in_folder( drive_client, folder_id, include_subfolders=False, page_token=page_token @@ -254,7 +236,6 @@ async def _index_full_scan( files_processed += 1 - # Process file indexed, skipped = await _process_single_file( drive_client=drive_client, session=session, @@ -269,7 +250,6 @@ async def _index_full_scan( documents_indexed += indexed documents_skipped += skipped - # Batch commit every 10 files if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} files indexed so far") @@ -304,7 +284,6 @@ async def _index_with_delta_sync( {"stage": "delta_sync", "start_token": start_page_token}, ) - # Fetch all changes since last sync changes, final_token, error = await fetch_all_changes( drive_client, start_page_token, folder_id ) @@ -330,14 +309,12 @@ async def _index_with_delta_sync( files_processed += 1 change_type = categorize_change(change) - # Handle removed/trashed files if change_type in ["removed", "trashed"]: file_id = change.get("fileId") if file_id: await _remove_document(session, file_id, search_space_id) continue - # Handle modified/new files file = change.get("file") if not file: continue @@ -356,7 +333,6 @@ async def _index_with_delta_sync( documents_indexed += indexed documents_skipped += skipped - # Batch commit every 10 files if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() logger.info(f"Committed batch: {documents_indexed} changes processed") @@ -389,10 +365,6 @@ async def _process_single_file( try: logger.info(f"Processing file: {file_name} ({mime_type})") - # Download and process using Surfsense's existing infrastructure - # This handles: markdown, audio, PDFs, Office docs, images, etc. - # It also handles: deduplication, chunking, summarization, embedding - # Document type is set to GOOGLE_DRIVE_CONNECTOR during processing _, error, _ = await download_and_process_file( client=drive_client, file=file, @@ -404,7 +376,6 @@ async def _process_single_file( ) if error: - # Log and skip - not an error, just unsupported or empty await task_logger.log_task_progress( log_entry, f"Skipped {file_name}: {error}", @@ -412,7 +383,6 @@ async def _process_single_file( ) return 0, 1 - # File was processed successfully (document type already set in processor) logger.info(f"Successfully indexed Google Drive file: {file_name}") return 1, 0