feat: enhance Google connectors indexing with content extraction and document migration

- Added `download_and_extract_content` function to extract content from Google Drive files as markdown. - Updated Google Drive indexer to utilize the new content extraction method. - Implemented document migration logic to update legacy Composio document types to their native Google types. - Introduced identifier hashing for stable document identification. - Improved file pre-filtering to handle unchanged and rename-only files efficiently.
2026-05-27 19:25:15 +02:00 · 2026-03-25 18:33:44 +05:30 · 2026-03-25 18:33:44 +05:30 · f7b52470eb
commit f7b52470eb
parent 2da6fd89ea
8 changed files with 951 additions and 1588 deletions
--- a/surfsense_backend/app/connectors/google_drive/init.py
+++ b/surfsense_backend/app/connectors/google_drive/init.py
@ -2,13 +2,14 @@
 from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token
 from .client import GoogleDriveClient
-from .content_extractor import download_and_process_file
+from .content_extractor import download_and_extract_content, download_and_process_file
 from .credentials import get_valid_credentials, validate_credentials
 from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents
 __all__ = [
    "GoogleDriveClient",
    "categorize_change",
    "download_and_extract_content",
    "download_and_process_file",
    "fetch_all_changes",
    "get_file_by_id",
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@ -17,6 +17,160 @@ from .file_types import get_export_mime_type, is_google_workspace_file, should_s
 logger = logging.getLogger(__name__)
 async def download_and_extract_content(
    client: GoogleDriveClient,
    file: dict[str, Any],
 ) -> tuple[str | None, dict[str, Any], str | None]:
    """Download a Google Drive file and extract its content as markdown.
    ETL only -- no DB writes, no indexing, no summarization.
    Returns:
        (markdown_content, drive_metadata, error_message)
        On success error_message is None.
    """
    file_id = file.get("id")
    file_name = file.get("name", "Unknown")
    mime_type = file.get("mimeType", "")
    if should_skip_file(mime_type):
        return None, {}, f"Skipping {mime_type}"
    logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
    drive_metadata: dict[str, Any] = {
        "google_drive_file_id": file_id,
        "google_drive_file_name": file_name,
        "google_drive_mime_type": mime_type,
        "source_connector": "google_drive",
    }
    if "modifiedTime" in file:
        drive_metadata["modified_time"] = file["modifiedTime"]
    if "createdTime" in file:
        drive_metadata["created_time"] = file["createdTime"]
    if "size" in file:
        drive_metadata["file_size"] = file["size"]
    if "webViewLink" in file:
        drive_metadata["web_view_link"] = file["webViewLink"]
    if "md5Checksum" in file:
        drive_metadata["md5_checksum"] = file["md5Checksum"]
    if is_google_workspace_file(mime_type):
        drive_metadata["exported_as"] = "pdf"
        drive_metadata["original_workspace_type"] = mime_type.split(".")[-1]
    temp_file_path = None
    try:
        # Download / export
        if is_google_workspace_file(mime_type):
            export_mime = get_export_mime_type(mime_type)
            if not export_mime:
                return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
            content_bytes, error = await client.export_google_file(file_id, export_mime)
            if error:
                return None, drive_metadata, error
            extension = ".pdf" if export_mime == "application/pdf" else ".txt"
        else:
            content_bytes, error = await client.download_file(file_id)
            if error:
                return None, drive_metadata, error
            extension = Path(file_name).suffix or ".bin"
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
            tmp.write(content_bytes)
            temp_file_path = tmp.name
        # Parse to markdown
        markdown = await _parse_file_to_markdown(temp_file_path, file_name)
        return markdown, drive_metadata, None
    except Exception as e:
        logger.warning(f"Failed to extract content from {file_name}: {e!s}")
        return None, drive_metadata, str(e)
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.unlink(temp_file_path)
            except Exception:
                pass
 async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
    """Parse a local file to markdown using the configured ETL service."""
    lower = filename.lower()
    if lower.endswith((".md", ".markdown", ".txt")):
        with open(file_path, encoding="utf-8") as f:
            return f.read()
    if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
        from app.config import config as app_config
        from litellm import atranscription
        stt_service_type = (
            "local"
            if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
            else "external"
        )
        if stt_service_type == "local":
            from app.services.stt_service import stt_service
            result = stt_service.transcribe_file(file_path)
            text = result.get("text", "")
        else:
            with open(file_path, "rb") as audio_file:
                kwargs: dict[str, Any] = {
                    "model": app_config.STT_SERVICE,
                    "file": audio_file,
                    "api_key": app_config.STT_SERVICE_API_KEY,
                }
                if app_config.STT_SERVICE_API_BASE:
                    kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
                resp = await atranscription(**kwargs)
                text = resp.get("text", "")
        if not text:
            raise ValueError("Transcription returned empty text")
        return f"# Transcription of {filename}\n\n{text}"
    # Document files -- use configured ETL service
    from app.config import config as app_config
    if app_config.ETL_SERVICE == "UNSTRUCTURED":
        from langchain_unstructured import UnstructuredLoader
        from app.utils.document_converters import convert_document_to_markdown
        loader = UnstructuredLoader(
            file_path,
            mode="elements",
            post_processors=[],
            languages=["eng"],
            include_orig_elements=False,
            include_metadata=False,
            strategy="auto",
        )
        docs = await loader.aload()
        return await convert_document_to_markdown(docs)
    if app_config.ETL_SERVICE == "LLAMACLOUD":
        from app.tasks.document_processors.file_processors import (
            parse_with_llamacloud_retry,
        )
        result = await parse_with_llamacloud_retry(file_path=file_path, estimated_pages=50)
        markdown_documents = await result.aget_markdown_documents(split_by_page=False)
        if not markdown_documents:
            raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
        return markdown_documents[0].text
    if app_config.ETL_SERVICE == "DOCLING":
        from docling.document_converter import DocumentConverter
        converter = DocumentConverter()
        result = converter.convert(file_path)
        return result.document.export_to_markdown()
    raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
 async def download_and_process_file(
    client: GoogleDriveClient,
    file: dict[str, Any],
--- a/surfsense_backend/app/indexing_pipeline/document_hashing.py
+++ b/surfsense_backend/app/indexing_pipeline/document_hashing.py
@ -3,10 +3,17 @@ import hashlib
 from app.indexing_pipeline.connector_document import ConnectorDocument
 def compute_identifier_hash(
    document_type_value: str, unique_id: str, search_space_id: int
 ) -> str:
    """Return a stable SHA-256 hash from raw identity components."""
    combined = f"{document_type_value}:{unique_id}:{search_space_id}"
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
 def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return a stable SHA-256 hash identifying a document by its source identity."""
-    combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}"
+    return compute_identifier_hash(doc.document_type.value, doc.unique_id, doc.search_space_id)
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()
 def compute_content_hash(doc: ConnectorDocument) -> str:
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@ -6,12 +6,13 @@ from sqlalchemy import delete, select
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.ext.asyncio import AsyncSession
-from app.db import Chunk, Document, DocumentStatus
+from app.db import NATIVE_TO_LEGACY_DOCTYPE, Chunk, Document, DocumentStatus
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_chunker import chunk_text
 from app.indexing_pipeline.document_embedder import embed_texts
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
    compute_identifier_hash,
    compute_unique_identifier_hash,
 )
 from app.indexing_pipeline.document_persistence import (
@ -54,6 +55,62 @@ class IndexingPipelineService:
    def __init__(self, session: AsyncSession) -> None:
        self.session = session
    async def migrate_legacy_docs(
        self, connector_docs: list[ConnectorDocument]
    ) -> None:
        """Migrate legacy Composio documents to their native Google type.
        For each ConnectorDocument whose document_type has a Composio equivalent
        in NATIVE_TO_LEGACY_DOCTYPE, look up the old document by legacy hash and
        update its unique_identifier_hash and document_type so that
        prepare_for_indexing() can find it under the native hash.
        """
        for doc in connector_docs:
            legacy_type = NATIVE_TO_LEGACY_DOCTYPE.get(doc.document_type.value)
            if not legacy_type:
                continue
            legacy_hash = compute_identifier_hash(
                legacy_type, doc.unique_id, doc.search_space_id
            )
            result = await self.session.execute(
                select(Document).filter(
                    Document.unique_identifier_hash == legacy_hash
                )
            )
            existing = result.scalars().first()
            if existing is None:
                continue
            native_hash = compute_identifier_hash(
                doc.document_type.value, doc.unique_id, doc.search_space_id
            )
            existing.unique_identifier_hash = native_hash
            existing.document_type = doc.document_type
        await self.session.commit()
    async def index_batch(
        self, connector_docs: list[ConnectorDocument], llm
    ) -> list[Document]:
        """Convenience method: prepare_for_indexing then index each document.
        Indexers that need heartbeat callbacks or custom per-document logic
        should call prepare_for_indexing() + index() directly instead.
        """
        doc_map = {
            compute_unique_identifier_hash(cd): cd for cd in connector_docs
        }
        documents = await self.prepare_for_indexing(connector_docs)
        results: list[Document] = []
        for document in documents:
            connector_doc = doc_map.get(document.unique_identifier_hash)
            if connector_doc is None:
                continue
            result = await self.index(document, connector_doc, llm)
            results.append(result)
        return results
    async def prepare_for_indexing(
        self, connector_docs: list[ConnectorDocument]
    ) -> list[Document]:
--- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py
@ -1,9 +1,8 @@
 """
 Google Calendar connector indexer.
-Implements 2-phase document status updates for real-time UI feedback:
+Uses the shared IndexingPipelineService for document deduplication,
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
+summarization, chunking, and embedding.
 - Phase 2: Process each document: pending → processing → ready/failed
 """
 import time
@ -15,29 +14,25 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.connectors.google_calendar_connector import GoogleCalendarConnector
-from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
+from app.db import DocumentType, SearchSourceConnectorType
 from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
    compute_unique_identifier_hash,
 )
 from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
    embed_text,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from app.utils.google_credentials import (
    COMPOSIO_GOOGLE_CONNECTOR_TYPES,
    build_composio_credentials,
 )
 from .base import (
    check_document_by_unique_identifier,
    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
    parse_date_flexible,
    safe_set_chunks,
    update_connector_last_indexed,
 )
@ -46,13 +41,60 @@ ACCEPTED_CALENDAR_CONNECTOR_TYPES = {
    SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
 }
 # Type hint for heartbeat callback
 HeartbeatCallbackType = Callable[[int], Awaitable[None]]
 # Heartbeat interval in seconds
 HEARTBEAT_INTERVAL_SECONDS = 30
 def _build_connector_doc(
    event: dict,
    event_markdown: str,
    *,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    enable_summary: bool,
 ) -> ConnectorDocument:
    """Map a raw Google Calendar API event dict to a ConnectorDocument."""
    event_id = event.get("id", "")
    event_summary = event.get("summary", "No Title")
    calendar_id = event.get("calendarId", "")
    start = event.get("start", {})
    end = event.get("end", {})
    start_time = start.get("dateTime") or start.get("date", "")
    end_time = end.get("dateTime") or end.get("date", "")
    location = event.get("location", "")
    metadata = {
        "event_id": event_id,
        "event_summary": event_summary,
        "calendar_id": calendar_id,
        "start_time": start_time,
        "end_time": end_time,
        "location": location,
        "connector_id": connector_id,
        "document_type": "Google Calendar Event",
        "connector_type": "Google Calendar",
    }
    fallback_summary = (
        f"Google Calendar Event: {event_summary}\n\n{event_markdown}"
    )
    return ConnectorDocument(
        title=event_summary,
        source_markdown=event_markdown,
        unique_id=event_id,
        document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
        search_space_id=search_space_id,
        connector_id=connector_id,
        created_by_id=user_id,
        should_summarize=enable_summary,
        fallback_summary=fallback_summary,
        metadata=metadata,
    )
 async def index_google_calendar_events(
    session: AsyncSession,
    connector_id: int,
@ -82,7 +124,6 @@ async def index_google_calendar_events(
    """
    task_logger = TaskLoggingService(session, search_space_id)
    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="google_calendar_events_indexing",
        source="connector_indexing_task",
@ -96,7 +137,7 @@ async def index_google_calendar_events(
    )
    try:
-        # Accept both native and Composio Calendar connectors
+        # ── Connector lookup ──────────────────────────────────────────
        connector = None
        for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES:
            connector = await get_connector_by_id(session, connector_id, ct)
@ -112,7 +153,7 @@ async def index_google_calendar_events(
            )
            return 0, 0, f"Connector with ID {connector_id} not found"
-        # Build credentials based on connector type
+        # ── Credential building ───────────────────────────────────────
        if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
            connected_account_id = connector.config.get("composio_connected_account_id")
            if not connected_account_id:
@ -184,6 +225,7 @@ async def index_google_calendar_events(
                )
                return 0, 0, "Google Calendar credentials not found in connector config"
        # ── Calendar client init ──────────────────────────────────────
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Google Calendar client for connector {connector_id}",
@ -203,36 +245,26 @@ async def index_google_calendar_events(
        if end_date == "undefined" or end_date == "":
            end_date = None
-        # Calculate date range
+        # ── Date range calculation ────────────────────────────────────
        # For calendar connectors, allow future dates to index upcoming events
        if start_date is None or end_date is None:
            # Fall back to calculating dates based on last_indexed_at
            # Default to today (users can manually select future dates if needed)
            calculated_end_date = datetime.now()
            # Use last_indexed_at as start date if available, otherwise use 30 days ago
            if connector.last_indexed_at:
                # Convert dates to be comparable (both timezone-naive)
                last_indexed_naive = (
                    connector.last_indexed_at.replace(tzinfo=None)
                    if connector.last_indexed_at.tzinfo
                    else connector.last_indexed_at
                )
                # Allow future dates - use last_indexed_at as start date
                calculated_start_date = last_indexed_naive
                logger.info(
                    f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
                )
            else:
-                calculated_start_date = datetime.now() - timedelta(
+                calculated_start_date = datetime.now() - timedelta(days=365)
                    days=365
                )  # Use 365 days as default for calendar events (matches frontend)
                logger.info(
                    f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
                )
            # Use calculated dates if not provided
            start_date_str = (
                start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
            )
@ -240,19 +272,14 @@ async def index_google_calendar_events(
                end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
            )
        else:
            # Use provided dates (including future dates)
            start_date_str = start_date
            end_date_str = end_date
        # FIX: Ensure end_date is at least 1 day after start_date to avoid
        # "start_date must be strictly before end_date" errors when dates are the same
        # (e.g., when last_indexed_at is today)
        if start_date_str == end_date_str:
            logger.info(
                f"Start date ({start_date_str}) equals end date ({end_date_str}), "
                "adjusting end date to next day to ensure valid date range"
            )
            # Parse end_date and add 1 day
            try:
                end_dt = parse_date_flexible(end_date_str)
            except ValueError:
@ -264,6 +291,7 @@ async def index_google_calendar_events(
            end_date_str = end_dt.strftime("%Y-%m-%d")
            logger.info(f"Adjusted end date to {end_date_str}")
        # ── Fetch events ──────────────────────────────────────────────
        await task_logger.log_task_progress(
            log_entry,
            f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
@ -274,27 +302,19 @@ async def index_google_calendar_events(
            },
        )
        # Get events within date range from primary calendar
        try:
            events, error = await calendar_client.get_all_primary_calendar_events(
                start_date=start_date_str, end_date=end_date_str
            )
            if error:
                # Don't treat "No events found" as an error that should stop indexing
                if "No events found" in error:
                    logger.info(f"No Google Calendar events found: {error}")
                    logger.info(
                        "No events found is not a critical error, continuing with update"
                    )
                    if update_last_indexed:
                        await update_connector_last_indexed(
                            session, connector, update_last_indexed
                        )
                        await session.commit()
                        logger.info(
                            f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found"
                        )
                    await task_logger.log_task_success(
                        log_entry,
@ -304,7 +324,6 @@ async def index_google_calendar_events(
                    return 0, 0, None
                else:
                    logger.error(f"Failed to get Google Calendar events: {error}")
                    # Check if this is an authentication error that requires re-authentication
                    error_message = error
                    error_type = "APIError"
                    if (
@ -329,28 +348,15 @@ async def index_google_calendar_events(
            logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
            return 0, 0, f"Error fetching Google Calendar events: {e!s}"
-        documents_indexed = 0
+        # ── Build ConnectorDocuments ──────────────────────────────────
        connector_docs: list[ConnectorDocument] = []
        documents_skipped = 0
-        documents_failed = 0  # Track events that failed processing
+        duplicate_content_count = 0
        duplicate_content_count = (
            0  # Track events skipped due to duplicate content_hash
        )
        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()
        # =======================================================================
        # PHASE 1: Analyze all events, create pending documents
        # This makes ALL documents visible in the UI immediately with pending status
        # =======================================================================
        events_to_process = []  # List of dicts with document and event data
        new_documents_created = False
        for event in events:
            try:
                event_id = event.get("id")
                event_summary = event.get("summary", "No Title")
                calendar_id = event.get("calendarId", "")
                if not event_id:
                    logger.warning(f"Skipping event with missing ID: {event_summary}")
@ -363,223 +369,73 @@ async def index_google_calendar_events(
                    documents_skipped += 1
                    continue
-                start = event.get("start", {})
+                doc = _build_connector_doc(
-                end = event.get("end", {})
+                    event,
-                start_time = start.get("dateTime") or start.get("date", "")
+                    event_markdown,
-                end_time = end.get("dateTime") or end.get("date", "")
+                    connector_id=connector_id,
-                location = event.get("location", "")
+                    search_space_id=search_space_id,
-                description = event.get("description", "")
+                    user_id=user_id,
-
+                    enable_summary=connector.enable_summary,
                # Generate unique identifier hash for this Google Calendar event
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
                )
                # Generate content hash
                content_hash = generate_content_hash(event_markdown, search_space_id)
                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )
                # Fallback: legacy Composio hash
                if not existing_document:
                    legacy_hash = generate_unique_identifier_hash(
                        DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
                        event_id,
                        search_space_id,
                    )
                    existing_document = await check_document_by_unique_identifier(
                        session, legacy_hash
                    )
                    if existing_document:
                        existing_document.unique_identifier_hash = (
                            unique_identifier_hash
                        )
                        if (
                            existing_document.document_type
                            == DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
                        ):
                            existing_document.document_type = (
                                DocumentType.GOOGLE_CALENDAR_CONNECTOR
                            )
                        logger.info(
                            f"Migrated legacy Composio Calendar document: {event_id}"
                        )
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        # Ensure status is ready (might have been stuck in processing/pending)
                        if not DocumentStatus.is_state(
                            existing_document.status, DocumentStatus.READY
                        ):
                            existing_document.status = DocumentStatus.ready()
                        documents_skipped += 1
                        continue
                    # Queue existing document for update (will be set to processing in Phase 2)
                    events_to_process.append(
                        {
                            "document": existing_document,
                            "is_new": False,
                            "event_markdown": event_markdown,
                            "content_hash": content_hash,
                            "event_id": event_id,
                            "event_summary": event_summary,
                            "calendar_id": calendar_id,
                            "start_time": start_time,
                            "end_time": end_time,
                            "location": location,
                            "description": description,
                        }
                    )
                    continue
                # Document doesn't exist by unique_identifier_hash
                # Check if a document with the same content_hash exists (from another connector)
                with session.no_autoflush:
-                    duplicate_by_content = await check_duplicate_document_by_hash(
+                    duplicate = await check_duplicate_document_by_hash(
-                        session, content_hash
+                        session, compute_content_hash(doc)
                    )
-
+                if duplicate:
                if duplicate_by_content:
                    # A document with the same content already exists (likely from Composio connector)
                    logger.info(
-                        f"Event {event_summary} already indexed by another connector "
+                        f"Event {doc.title} already indexed by another connector "
-                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"(existing document ID: {duplicate.id}, "
-                        f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
+                        f"type: {duplicate.document_type}). Skipping."
                    )
                    duplicate_content_count += 1
                    documents_skipped += 1
                    continue
-                # Create new document with PENDING status (visible in UI immediately)
+                connector_docs.append(doc)
                document = Document(
                    search_space_id=search_space_id,
                    title=event_summary,
                    document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
                    document_metadata={
                        "event_id": event_id,
                        "event_summary": event_summary,
                        "calendar_id": calendar_id,
                        "start_time": start_time,
                        "end_time": end_time,
                        "location": location,
                        "connector_id": connector_id,
                    },
                    content="Pending...",  # Placeholder until processed
                    content_hash=unique_identifier_hash,  # Temporary unique value - updated when ready
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=None,
                    chunks=[],  # Empty at creation - safe for async
                    status=DocumentStatus.pending(),  # Pending until processing starts
                    updated_at=get_current_timestamp(),
                    created_by_id=user_id,
                    connector_id=connector_id,
                )
                session.add(document)
                new_documents_created = True
                events_to_process.append(
                    {
                        "document": document,
                        "is_new": True,
                        "event_markdown": event_markdown,
                        "content_hash": content_hash,
                        "event_id": event_id,
                        "event_summary": event_summary,
                        "calendar_id": calendar_id,
                        "start_time": start_time,
                        "end_time": end_time,
                        "location": location,
                        "description": description,
                    }
                )
            except Exception as e:
-                logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
+                logger.error(f"Error building ConnectorDocument for event: {e!s}", exc_info=True)
-                documents_failed += 1
+                documents_skipped += 1
                continue
-        # Commit all pending documents - they all appear in UI now
+        # ── Pipeline: migrate legacy docs + prepare + index ───────────
-        if new_documents_created:
+        pipeline = IndexingPipelineService(session)
            logger.info(
                f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
            )
            await session.commit()
-        # =======================================================================
+        await pipeline.migrate_legacy_docs(connector_docs)
        # PHASE 2: Process each document one by one
        # Each document transitions: pending → processing → ready/failed
        # =======================================================================
        logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
-        for item in events_to_process:
+        documents = await pipeline.prepare_for_indexing(connector_docs)
-            # Send heartbeat periodically
+
        doc_map = {
            compute_unique_identifier_hash(cd): cd for cd in connector_docs
        }
        documents_indexed = 0
        documents_failed = 0
        last_heartbeat_time = time.time()
        for document in documents:
            if on_heartbeat_callback:
                current_time = time.time()
                if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
                    await on_heartbeat_callback(documents_indexed)
                    last_heartbeat_time = current_time
-            document = item["document"]
+            connector_doc = doc_map.get(document.unique_identifier_hash)
-            try:
+            if connector_doc is None:
-                # Set to PROCESSING and commit - shows "processing" in UI for THIS document only
+                logger.warning(
-                document.status = DocumentStatus.processing()
+                    f"No matching ConnectorDocument for document {document.id}, skipping"
-                await session.commit()
+                )
                documents_failed += 1
                continue
-                # Heavy processing (LLM, embeddings, chunks)
+            try:
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
-
+                await pipeline.index(document, connector_doc, user_llm)
                if user_llm and connector.enable_summary:
                    document_metadata_for_summary = {
                        "event_id": item["event_id"],
                        "event_summary": item["event_summary"],
                        "calendar_id": item["calendar_id"],
                        "start_time": item["start_time"],
                        "end_time": item["end_time"],
                        "location": item["location"] or "No location",
                        "document_type": "Google Calendar Event",
                        "connector_type": "Google Calendar",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        item["event_markdown"], user_llm, document_metadata_for_summary
                    )
                else:
                    summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}"
                    summary_embedding = embed_text(summary_content)
                chunks = await create_document_chunks(item["event_markdown"])
                # Update document to READY with actual content
                document.title = item["event_summary"]
                document.content = summary_content
                document.content_hash = item["content_hash"]
                document.embedding = summary_embedding
                document.document_metadata = {
                    "event_id": item["event_id"],
                    "event_summary": item["event_summary"],
                    "calendar_id": item["calendar_id"],
                    "start_time": item["start_time"],
                    "end_time": item["end_time"],
                    "location": item["location"],
                    "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "connector_id": connector_id,
                }
                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()
                documents_indexed += 1
                # Batch commit every 10 documents (for ready status updates)
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} Google Calendar events processed so far"
@ -588,21 +444,12 @@ async def index_google_calendar_events(
            except Exception as e:
                logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
                # Mark document as failed with reason (visible in UI)
                try:
                    document.status = DocumentStatus.failed(str(e))
                    document.updated_at = get_current_timestamp()
                except Exception as status_error:
                    logger.error(
                        f"Failed to update document status to failed: {status_error}"
                    )
                documents_failed += 1
                continue
-        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
+        # ── Finalize ──────────────────────────────────────────────────
        await update_connector_last_indexed(session, connector, update_last_indexed)
        # Final commit for any remaining documents not yet committed in batches
        logger.info(
            f"Final commit: Total {documents_indexed} Google Calendar events processed"
        )
@ -612,22 +459,18 @@ async def index_google_calendar_events(
                "Successfully committed all Google Calendar document changes to database"
            )
        except Exception as e:
            # Handle any remaining integrity errors gracefully (race conditions, etc.)
            if (
                "duplicate key value violates unique constraint" in str(e).lower()
                or "uniqueviolationerror" in str(e).lower()
            ):
                logger.warning(
                    f"Duplicate content_hash detected during final commit. "
                    f"This may occur if the same event was indexed by multiple connectors. "
                    f"Rolling back and continuing. Error: {e!s}"
                )
                await session.rollback()
                # Don't fail the entire task - some documents may have been successfully indexed
            else:
                raise
        # Build warning message if there were issues
        warning_parts = []
        if duplicate_content_count > 0:
            warning_parts.append(f"{duplicate_content_count} duplicate")
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
--- a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py
@ -1,11 +1,11 @@
 """
 Google Gmail connector indexer.
-Implements 2-phase document status updates for real-time UI feedback:
+Uses the shared IndexingPipelineService for document deduplication,
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
+summarization, chunking, and embedding.
 - Phase 2: Process each document: pending → processing → ready/failed
 """
 import logging
 import time
 from collections.abc import Awaitable, Callable
 from datetime import datetime
@ -15,21 +15,15 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.connectors.google_gmail_connector import GoogleGmailConnector
-from app.db import (
+from app.db import DocumentType, SearchSourceConnectorType
-    Document,
+from app.indexing_pipeline.connector_document import ConnectorDocument
-    DocumentStatus,
+from app.indexing_pipeline.document_hashing import (
-    DocumentType,
+    compute_content_hash,
-    SearchSourceConnectorType,
+    compute_unique_identifier_hash,
 )
 from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    create_document_chunks,
    embed_text,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
 )
 from app.utils.google_credentials import (
    COMPOSIO_GOOGLE_CONNECTOR_TYPES,
    build_composio_credentials,
@ -37,12 +31,9 @@ from app.utils.google_credentials import (
 from .base import (
    calculate_date_range,
    check_document_by_unique_identifier,
    check_duplicate_document_by_hash,
    get_connector_by_id,
    get_current_timestamp,
    logger,
    safe_set_chunks,
    update_connector_last_indexed,
 )
@ -51,13 +42,70 @@ ACCEPTED_GMAIL_CONNECTOR_TYPES = {
    SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
 }
 # Type hint for heartbeat callback
 HeartbeatCallbackType = Callable[[int], Awaitable[None]]
 # Heartbeat interval in seconds
 HEARTBEAT_INTERVAL_SECONDS = 30
 def _build_connector_doc(
    message: dict,
    markdown_content: str,
    *,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    enable_summary: bool,
 ) -> ConnectorDocument:
    """Map a raw Gmail API message dict to a ConnectorDocument."""
    message_id = message.get("id", "")
    thread_id = message.get("threadId", "")
    payload = message.get("payload", {})
    headers = payload.get("headers", [])
    subject = "No Subject"
    sender = "Unknown Sender"
    date_str = "Unknown Date"
    for header in headers:
        name = header.get("name", "").lower()
        value = header.get("value", "")
        if name == "subject":
            subject = value
        elif name == "from":
            sender = value
        elif name == "date":
            date_str = value
    metadata = {
        "message_id": message_id,
        "thread_id": thread_id,
        "subject": subject,
        "sender": sender,
        "date": date_str,
        "connector_id": connector_id,
        "document_type": "Gmail Message",
        "connector_type": "Google Gmail",
    }
    fallback_summary = (
        f"Google Gmail Message: {subject}\n\n"
        f"From: {sender}\nDate: {date_str}\n\n"
        f"{markdown_content}"
    )
    return ConnectorDocument(
        title=subject,
        source_markdown=markdown_content,
        unique_id=message_id,
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        search_space_id=search_space_id,
        connector_id=connector_id,
        created_by_id=user_id,
        should_summarize=enable_summary,
        fallback_summary=fallback_summary,
        metadata=metadata,
    )
 async def index_google_gmail_messages(
    session: AsyncSession,
    connector_id: int,
@ -80,7 +128,7 @@ async def index_google_gmail_messages(
        start_date: Start date for filtering messages (YYYY-MM-DD format)
        end_date: End date for filtering messages (YYYY-MM-DD format)
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
-        max_messages: Maximum number of messages to fetch (default: 100)
+        max_messages: Maximum number of messages to fetch (default: 1000)
        on_heartbeat_callback: Optional callback to update notification during long-running indexing.
    Returns:
@ -88,7 +136,6 @@ async def index_google_gmail_messages(
    """
    task_logger = TaskLoggingService(session, search_space_id)
    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="google_gmail_messages_indexing",
        source="connector_indexing_task",
@ -103,7 +150,7 @@ async def index_google_gmail_messages(
    )
    try:
-        # Accept both native and Composio Gmail connectors
+        # ── Connector lookup ──────────────────────────────────────────
        connector = None
        for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES:
            connector = await get_connector_by_id(session, connector_id, ct)
@ -117,7 +164,7 @@ async def index_google_gmail_messages(
            )
            return 0, 0, error_msg
-        # Build credentials based on connector type
+        # ── Credential building ───────────────────────────────────────
        if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
            connected_account_id = connector.config.get("composio_connected_account_id")
            if not connected_account_id:
@ -189,6 +236,7 @@ async def index_google_gmail_messages(
                )
                return 0, 0, "Google gmail credentials not found in connector config"
        # ── Gmail client init ─────────────────────────────────────────
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing Google gmail client for connector {connector_id}",
@ -199,14 +247,11 @@ async def index_google_gmail_messages(
            credentials, session, user_id, connector_id
        )
        # Calculate date range using last_indexed_at if dates not provided
        # This ensures Gmail uses the same date logic as other connectors
        # (uses last_indexed_at → now, or 365 days back for first-time indexing)
        calculated_start_date, calculated_end_date = calculate_date_range(
            connector, start_date, end_date, default_days_back=365
        )
-        # Fetch recent Google gmail messages
+        # ── Fetch messages ────────────────────────────────────────────
        logger.info(
            f"Fetching emails for connector {connector_id} "
            f"from {calculated_start_date} to {calculated_end_date}"
@ -218,7 +263,6 @@ async def index_google_gmail_messages(
        )
        if error:
            # Check if this is an authentication error that requires re-authentication
            error_message = error
            error_type = "APIError"
            if (
@ -243,263 +287,92 @@ async def index_google_gmail_messages(
        logger.info(f"Found {len(messages)} Google gmail messages to index")
-        documents_indexed = 0
+        # ── Build ConnectorDocuments ──────────────────────────────────
        connector_docs: list[ConnectorDocument] = []
        documents_skipped = 0
-        documents_failed = 0  # Track messages that failed processing
+        duplicate_content_count = 0
        duplicate_content_count = (
            0  # Track messages skipped due to duplicate content_hash
        )
        # Heartbeat tracking - update notification periodically to prevent appearing stuck
        last_heartbeat_time = time.time()
        # =======================================================================
        # PHASE 1: Analyze all messages, create pending documents
        # This makes ALL documents visible in the UI immediately with pending status
        # =======================================================================
        messages_to_process = []  # List of dicts with document and message data
        new_documents_created = False
        for message in messages:
            try:
                # Extract message information
                message_id = message.get("id", "")
                thread_id = message.get("threadId", "")
                # Extract headers for subject and sender
                payload = message.get("payload", {})
                headers = payload.get("headers", [])
                subject = "No Subject"
                sender = "Unknown Sender"
                date_str = "Unknown Date"
                for header in headers:
                    name = header.get("name", "").lower()
                    value = header.get("value", "")
                    if name == "subject":
                        subject = value
                    elif name == "from":
                        sender = value
                    elif name == "date":
                        date_str = value
                if not message_id:
-                    logger.warning(f"Skipping message with missing ID: {subject}")
+                    logger.warning("Skipping message with missing ID")
                    documents_skipped += 1
                    continue
                # Format message to markdown
                markdown_content = gmail_connector.format_message_to_markdown(message)
                if not markdown_content.strip():
-                    logger.warning(f"Skipping message with no content: {subject}")
+                    logger.warning(f"Skipping message with no content: {message_id}")
                    documents_skipped += 1
                    continue
-                # Generate unique identifier hash for this Gmail message
+                doc = _build_connector_doc(
-                unique_identifier_hash = generate_unique_identifier_hash(
+                    message,
-                    DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id
+                    markdown_content,
                    connector_id=connector_id,
                    search_space_id=search_space_id,
                    user_id=user_id,
                    enable_summary=connector.enable_summary,
                )
                # Generate content hash
                content_hash = generate_content_hash(markdown_content, search_space_id)
                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )
                # Fallback: legacy Composio hash
                if not existing_document:
                    legacy_hash = generate_unique_identifier_hash(
                        DocumentType.COMPOSIO_GMAIL_CONNECTOR,
                        message_id,
                        search_space_id,
                    )
                    existing_document = await check_document_by_unique_identifier(
                        session, legacy_hash
                    )
                    if existing_document:
                        existing_document.unique_identifier_hash = (
                            unique_identifier_hash
                        )
                        if (
                            existing_document.document_type
                            == DocumentType.COMPOSIO_GMAIL_CONNECTOR
                        ):
                            existing_document.document_type = (
                                DocumentType.GOOGLE_GMAIL_CONNECTOR
                            )
                        logger.info(
                            f"Migrated legacy Composio Gmail document: {message_id}"
                        )
                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        # Ensure status is ready (might have been stuck in processing/pending)
                        if not DocumentStatus.is_state(
                            existing_document.status, DocumentStatus.READY
                        ):
                            existing_document.status = DocumentStatus.ready()
                        documents_skipped += 1
                        continue
                    # Queue existing document for update (will be set to processing in Phase 2)
                    messages_to_process.append(
                        {
                            "document": existing_document,
                            "is_new": False,
                            "markdown_content": markdown_content,
                            "content_hash": content_hash,
                            "message_id": message_id,
                            "thread_id": thread_id,
                            "subject": subject,
                            "sender": sender,
                            "date_str": date_str,
                        }
                    )
                    continue
                # Document doesn't exist by unique_identifier_hash
                # Check if a document with the same content_hash exists (from another connector)
                with session.no_autoflush:
-                    duplicate_by_content = await check_duplicate_document_by_hash(
+                    duplicate = await check_duplicate_document_by_hash(
-                        session, content_hash
+                        session, compute_content_hash(doc)
                    )
-
+                if duplicate:
                if duplicate_by_content:
                    logger.info(
-                        f"Gmail message {subject} already indexed by another connector "
+                        f"Gmail message {doc.title} already indexed by another connector "
-                        f"(existing document ID: {duplicate_by_content.id}, "
+                        f"(existing document ID: {duplicate.id}, "
-                        f"type: {duplicate_by_content.document_type}). Skipping."
+                        f"type: {duplicate.document_type}). Skipping."
                    )
                    duplicate_content_count += 1
                    documents_skipped += 1
                    continue
-                # Create new document with PENDING status (visible in UI immediately)
+                connector_docs.append(doc)
                document = Document(
                    search_space_id=search_space_id,
                    title=subject,
                    document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
                    document_metadata={
                        "message_id": message_id,
                        "thread_id": thread_id,
                        "subject": subject,
                        "sender": sender,
                        "date": date_str,
                        "connector_id": connector_id,
                    },
                    content="Pending...",  # Placeholder until processed
                    content_hash=unique_identifier_hash,  # Temporary unique value - updated when ready
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=None,
                    chunks=[],  # Empty at creation - safe for async
                    status=DocumentStatus.pending(),  # Pending until processing starts
                    updated_at=get_current_timestamp(),
                    created_by_id=user_id,
                    connector_id=connector_id,
                )
                session.add(document)
                new_documents_created = True
                messages_to_process.append(
                    {
                        "document": document,
                        "is_new": True,
                        "markdown_content": markdown_content,
                        "content_hash": content_hash,
                        "message_id": message_id,
                        "thread_id": thread_id,
                        "subject": subject,
                        "sender": sender,
                        "date_str": date_str,
                    }
                )
            except Exception as e:
-                logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
+                logger.error(f"Error building ConnectorDocument for message: {e!s}", exc_info=True)
-                documents_failed += 1
+                documents_skipped += 1
                continue
-        # Commit all pending documents - they all appear in UI now
+        # ── Pipeline: migrate legacy docs + prepare + index ───────────
-        if new_documents_created:
+        pipeline = IndexingPipelineService(session)
            logger.info(
                f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
            )
            await session.commit()
-        # =======================================================================
+        await pipeline.migrate_legacy_docs(connector_docs)
        # PHASE 2: Process each document one by one
        # Each document transitions: pending → processing → ready/failed
        # =======================================================================
        logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
-        for item in messages_to_process:
+        documents = await pipeline.prepare_for_indexing(connector_docs)
-            # Send heartbeat periodically
+
        doc_map = {
            compute_unique_identifier_hash(cd): cd for cd in connector_docs
        }
        documents_indexed = 0
        documents_failed = 0
        last_heartbeat_time = time.time()
        for document in documents:
            if on_heartbeat_callback:
                current_time = time.time()
                if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
                    await on_heartbeat_callback(documents_indexed)
                    last_heartbeat_time = current_time
-            document = item["document"]
+            connector_doc = doc_map.get(document.unique_identifier_hash)
-            try:
+            if connector_doc is None:
-                # Set to PROCESSING and commit - shows "processing" in UI for THIS document only
+                logger.warning(
-                document.status = DocumentStatus.processing()
+                    f"No matching ConnectorDocument for document {document.id}, skipping"
-                await session.commit()
+                )
                documents_failed += 1
                continue
-                # Heavy processing (LLM, embeddings, chunks)
+            try:
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )
-
+                await pipeline.index(document, connector_doc, user_llm)
                if user_llm and connector.enable_summary:
                    document_metadata_for_summary = {
                        "message_id": item["message_id"],
                        "thread_id": item["thread_id"],
                        "subject": item["subject"],
                        "sender": item["sender"],
                        "date": item["date_str"],
                        "document_type": "Gmail Message",
                        "connector_type": "Google Gmail",
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        item["markdown_content"],
                        user_llm,
                        document_metadata_for_summary,
                    )
                else:
                    summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}"
                    summary_embedding = embed_text(summary_content)
                chunks = await create_document_chunks(item["markdown_content"])
                # Update document to READY with actual content
                document.title = item["subject"]
                document.content = summary_content
                document.content_hash = item["content_hash"]
                document.embedding = summary_embedding
                document.document_metadata = {
                    "message_id": item["message_id"],
                    "thread_id": item["thread_id"],
                    "subject": item["subject"],
                    "sender": item["sender"],
                    "date": item["date_str"],
                    "connector_id": connector_id,
                }
                await safe_set_chunks(session, document, chunks)
                document.updated_at = get_current_timestamp()
                document.status = DocumentStatus.ready()
                documents_indexed += 1
                # Batch commit every 10 documents (for ready status updates)
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} Gmail messages processed so far"
@ -508,21 +381,12 @@ async def index_google_gmail_messages(
            except Exception as e:
                logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
                # Mark document as failed with reason (visible in UI)
                try:
                    document.status = DocumentStatus.failed(str(e))
                    document.updated_at = get_current_timestamp()
                except Exception as status_error:
                    logger.error(
                        f"Failed to update document status to failed: {status_error}"
                    )
                documents_failed += 1
                continue
-        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
+        # ── Finalize ──────────────────────────────────────────────────
        await update_connector_last_indexed(session, connector, update_last_indexed)
        # Final commit for any remaining documents not yet committed in batches
        logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
        try:
            await session.commit()
@ -530,22 +394,18 @@ async def index_google_gmail_messages(
                "Successfully committed all Google Gmail document changes to database"
            )
        except Exception as e:
            # Handle any remaining integrity errors gracefully (race conditions, etc.)
            if (
                "duplicate key value violates unique constraint" in str(e).lower()
                or "uniqueviolationerror" in str(e).lower()
            ):
                logger.warning(
                    f"Duplicate content_hash detected during final commit. "
                    f"This may occur if the same message was indexed by multiple connectors. "
                    f"Rolling back and continuing. Error: {e!s}"
                )
                await session.rollback()
                # Don't fail the entire task - some documents may have been successfully indexed
            else:
                raise
        # Build warning message if there were issues
        warning_parts = []
        if duplicate_content_count > 0:
            warning_parts.append(f"{duplicate_content_count} duplicate")
@ -555,7 +415,6 @@ async def index_google_gmail_messages(
        total_processed = documents_indexed
        # Log success
        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed Google Gmail indexing for connector {connector_id}",
--- a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py
+++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py
@ -3,6 +3,7 @@ import pytest
 from app.db import DocumentType
 from app.indexing_pipeline.document_hashing import (
    compute_content_hash,
    compute_identifier_hash,
    compute_unique_identifier_hash,
 )
@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume
    doc_a = make_connector_document(source_markdown="Original content")
    doc_b = make_connector_document(source_markdown="Updated content")
    assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
 def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document):
    """Raw-args hash equals ConnectorDocument hash for equivalent inputs."""
    doc = make_connector_document(
        document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
        unique_id="msg-123",
        search_space_id=5,
    )
    raw_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5)
    assert raw_hash == compute_unique_identifier_hash(doc)
 def test_compute_identifier_hash_differs_for_different_inputs():
    """Different arguments produce different hashes."""
    h1 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 1)
    h2 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-2", 1)
    h3 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 2)
    h4 = compute_identifier_hash("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1)
    assert len({h1, h2, h3, h4}) == 4