feat: enhance Google connectors indexing with content extraction and document migration

- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
This commit is contained in:
Anish Sarkar 2026-03-25 18:33:44 +05:30
parent 2da6fd89ea
commit f7b52470eb
8 changed files with 951 additions and 1588 deletions

View file

@ -2,13 +2,14 @@
from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token
from .client import GoogleDriveClient from .client import GoogleDriveClient
from .content_extractor import download_and_process_file from .content_extractor import download_and_extract_content, download_and_process_file
from .credentials import get_valid_credentials, validate_credentials from .credentials import get_valid_credentials, validate_credentials
from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents
__all__ = [ __all__ = [
"GoogleDriveClient", "GoogleDriveClient",
"categorize_change", "categorize_change",
"download_and_extract_content",
"download_and_process_file", "download_and_process_file",
"fetch_all_changes", "fetch_all_changes",
"get_file_by_id", "get_file_by_id",

View file

@ -17,6 +17,160 @@ from .file_types import get_export_mime_type, is_google_workspace_file, should_s
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
async def download_and_extract_content(
client: GoogleDriveClient,
file: dict[str, Any],
) -> tuple[str | None, dict[str, Any], str | None]:
"""Download a Google Drive file and extract its content as markdown.
ETL only -- no DB writes, no indexing, no summarization.
Returns:
(markdown_content, drive_metadata, error_message)
On success error_message is None.
"""
file_id = file.get("id")
file_name = file.get("name", "Unknown")
mime_type = file.get("mimeType", "")
if should_skip_file(mime_type):
return None, {}, f"Skipping {mime_type}"
logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
drive_metadata: dict[str, Any] = {
"google_drive_file_id": file_id,
"google_drive_file_name": file_name,
"google_drive_mime_type": mime_type,
"source_connector": "google_drive",
}
if "modifiedTime" in file:
drive_metadata["modified_time"] = file["modifiedTime"]
if "createdTime" in file:
drive_metadata["created_time"] = file["createdTime"]
if "size" in file:
drive_metadata["file_size"] = file["size"]
if "webViewLink" in file:
drive_metadata["web_view_link"] = file["webViewLink"]
if "md5Checksum" in file:
drive_metadata["md5_checksum"] = file["md5Checksum"]
if is_google_workspace_file(mime_type):
drive_metadata["exported_as"] = "pdf"
drive_metadata["original_workspace_type"] = mime_type.split(".")[-1]
temp_file_path = None
try:
# Download / export
if is_google_workspace_file(mime_type):
export_mime = get_export_mime_type(mime_type)
if not export_mime:
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
content_bytes, error = await client.export_google_file(file_id, export_mime)
if error:
return None, drive_metadata, error
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
else:
content_bytes, error = await client.download_file(file_id)
if error:
return None, drive_metadata, error
extension = Path(file_name).suffix or ".bin"
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
tmp.write(content_bytes)
temp_file_path = tmp.name
# Parse to markdown
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
return markdown, drive_metadata, None
except Exception as e:
logger.warning(f"Failed to extract content from {file_name}: {e!s}")
return None, drive_metadata, str(e)
finally:
if temp_file_path and os.path.exists(temp_file_path):
try:
os.unlink(temp_file_path)
except Exception:
pass
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
"""Parse a local file to markdown using the configured ETL service."""
lower = filename.lower()
if lower.endswith((".md", ".markdown", ".txt")):
with open(file_path, encoding="utf-8") as f:
return f.read()
if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
from app.config import config as app_config
from litellm import atranscription
stt_service_type = (
"local"
if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
else "external"
)
if stt_service_type == "local":
from app.services.stt_service import stt_service
result = stt_service.transcribe_file(file_path)
text = result.get("text", "")
else:
with open(file_path, "rb") as audio_file:
kwargs: dict[str, Any] = {
"model": app_config.STT_SERVICE,
"file": audio_file,
"api_key": app_config.STT_SERVICE_API_KEY,
}
if app_config.STT_SERVICE_API_BASE:
kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
resp = await atranscription(**kwargs)
text = resp.get("text", "")
if not text:
raise ValueError("Transcription returned empty text")
return f"# Transcription of {filename}\n\n{text}"
# Document files -- use configured ETL service
from app.config import config as app_config
if app_config.ETL_SERVICE == "UNSTRUCTURED":
from langchain_unstructured import UnstructuredLoader
from app.utils.document_converters import convert_document_to_markdown
loader = UnstructuredLoader(
file_path,
mode="elements",
post_processors=[],
languages=["eng"],
include_orig_elements=False,
include_metadata=False,
strategy="auto",
)
docs = await loader.aload()
return await convert_document_to_markdown(docs)
if app_config.ETL_SERVICE == "LLAMACLOUD":
from app.tasks.document_processors.file_processors import (
parse_with_llamacloud_retry,
)
result = await parse_with_llamacloud_retry(file_path=file_path, estimated_pages=50)
markdown_documents = await result.aget_markdown_documents(split_by_page=False)
if not markdown_documents:
raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
return markdown_documents[0].text
if app_config.ETL_SERVICE == "DOCLING":
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
return result.document.export_to_markdown()
raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
async def download_and_process_file( async def download_and_process_file(
client: GoogleDriveClient, client: GoogleDriveClient,
file: dict[str, Any], file: dict[str, Any],

View file

@ -3,10 +3,17 @@ import hashlib
from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.connector_document import ConnectorDocument
def compute_identifier_hash(
document_type_value: str, unique_id: str, search_space_id: int
) -> str:
"""Return a stable SHA-256 hash from raw identity components."""
combined = f"{document_type_value}:{unique_id}:{search_space_id}"
return hashlib.sha256(combined.encode("utf-8")).hexdigest()
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str: def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
"""Return a stable SHA-256 hash identifying a document by its source identity.""" """Return a stable SHA-256 hash identifying a document by its source identity."""
combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}" return compute_identifier_hash(doc.document_type.value, doc.unique_id, doc.search_space_id)
return hashlib.sha256(combined.encode("utf-8")).hexdigest()
def compute_content_hash(doc: ConnectorDocument) -> str: def compute_content_hash(doc: ConnectorDocument) -> str:

View file

@ -6,12 +6,13 @@ from sqlalchemy import delete, select
from sqlalchemy.exc import IntegrityError from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Chunk, Document, DocumentStatus from app.db import NATIVE_TO_LEGACY_DOCTYPE, Chunk, Document, DocumentStatus
from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_chunker import chunk_text from app.indexing_pipeline.document_chunker import chunk_text
from app.indexing_pipeline.document_embedder import embed_texts from app.indexing_pipeline.document_embedder import embed_texts
from app.indexing_pipeline.document_hashing import ( from app.indexing_pipeline.document_hashing import (
compute_content_hash, compute_content_hash,
compute_identifier_hash,
compute_unique_identifier_hash, compute_unique_identifier_hash,
) )
from app.indexing_pipeline.document_persistence import ( from app.indexing_pipeline.document_persistence import (
@ -54,6 +55,62 @@ class IndexingPipelineService:
def __init__(self, session: AsyncSession) -> None: def __init__(self, session: AsyncSession) -> None:
self.session = session self.session = session
async def migrate_legacy_docs(
self, connector_docs: list[ConnectorDocument]
) -> None:
"""Migrate legacy Composio documents to their native Google type.
For each ConnectorDocument whose document_type has a Composio equivalent
in NATIVE_TO_LEGACY_DOCTYPE, look up the old document by legacy hash and
update its unique_identifier_hash and document_type so that
prepare_for_indexing() can find it under the native hash.
"""
for doc in connector_docs:
legacy_type = NATIVE_TO_LEGACY_DOCTYPE.get(doc.document_type.value)
if not legacy_type:
continue
legacy_hash = compute_identifier_hash(
legacy_type, doc.unique_id, doc.search_space_id
)
result = await self.session.execute(
select(Document).filter(
Document.unique_identifier_hash == legacy_hash
)
)
existing = result.scalars().first()
if existing is None:
continue
native_hash = compute_identifier_hash(
doc.document_type.value, doc.unique_id, doc.search_space_id
)
existing.unique_identifier_hash = native_hash
existing.document_type = doc.document_type
await self.session.commit()
async def index_batch(
self, connector_docs: list[ConnectorDocument], llm
) -> list[Document]:
"""Convenience method: prepare_for_indexing then index each document.
Indexers that need heartbeat callbacks or custom per-document logic
should call prepare_for_indexing() + index() directly instead.
"""
doc_map = {
compute_unique_identifier_hash(cd): cd for cd in connector_docs
}
documents = await self.prepare_for_indexing(connector_docs)
results: list[Document] = []
for document in documents:
connector_doc = doc_map.get(document.unique_identifier_hash)
if connector_doc is None:
continue
result = await self.index(document, connector_doc, llm)
results.append(result)
return results
async def prepare_for_indexing( async def prepare_for_indexing(
self, connector_docs: list[ConnectorDocument] self, connector_docs: list[ConnectorDocument]
) -> list[Document]: ) -> list[Document]:

View file

@ -1,9 +1,8 @@
""" """
Google Calendar connector indexer. Google Calendar connector indexer.
Implements 2-phase document status updates for real-time UI feedback: Uses the shared IndexingPipelineService for document deduplication,
- Phase 1: Create all documents with 'pending' status (visible in UI immediately) summarization, chunking, and embedding.
- Phase 2: Process each document: pending processing ready/failed
""" """
import time import time
@ -15,29 +14,25 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.google_calendar_connector import GoogleCalendarConnector from app.connectors.google_calendar_connector import GoogleCalendarConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.db import DocumentType, SearchSourceConnectorType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import (
compute_content_hash,
compute_unique_identifier_hash,
)
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.google_credentials import ( from app.utils.google_credentials import (
COMPOSIO_GOOGLE_CONNECTOR_TYPES, COMPOSIO_GOOGLE_CONNECTOR_TYPES,
build_composio_credentials, build_composio_credentials,
) )
from .base import ( from .base import (
check_document_by_unique_identifier,
check_duplicate_document_by_hash, check_duplicate_document_by_hash,
get_connector_by_id, get_connector_by_id,
get_current_timestamp,
logger, logger,
parse_date_flexible, parse_date_flexible,
safe_set_chunks,
update_connector_last_indexed, update_connector_last_indexed,
) )
@ -46,13 +41,60 @@ ACCEPTED_CALENDAR_CONNECTOR_TYPES = {
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR, SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
} }
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]] HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30 HEARTBEAT_INTERVAL_SECONDS = 30
def _build_connector_doc(
event: dict,
event_markdown: str,
*,
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Google Calendar API event dict to a ConnectorDocument."""
event_id = event.get("id", "")
event_summary = event.get("summary", "No Title")
calendar_id = event.get("calendarId", "")
start = event.get("start", {})
end = event.get("end", {})
start_time = start.get("dateTime") or start.get("date", "")
end_time = end.get("dateTime") or end.get("date", "")
location = event.get("location", "")
metadata = {
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
fallback_summary = (
f"Google Calendar Event: {event_summary}\n\n{event_markdown}"
)
return ConnectorDocument(
title=event_summary,
source_markdown=event_markdown,
unique_id=event_id,
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
async def index_google_calendar_events( async def index_google_calendar_events(
session: AsyncSession, session: AsyncSession,
connector_id: int, connector_id: int,
@ -82,7 +124,6 @@ async def index_google_calendar_events(
""" """
task_logger = TaskLoggingService(session, search_space_id) task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start( log_entry = await task_logger.log_task_start(
task_name="google_calendar_events_indexing", task_name="google_calendar_events_indexing",
source="connector_indexing_task", source="connector_indexing_task",
@ -96,7 +137,7 @@ async def index_google_calendar_events(
) )
try: try:
# Accept both native and Composio Calendar connectors # ── Connector lookup ──────────────────────────────────────────
connector = None connector = None
for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES: for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct) connector = await get_connector_by_id(session, connector_id, ct)
@ -112,7 +153,7 @@ async def index_google_calendar_events(
) )
return 0, 0, f"Connector with ID {connector_id} not found" return 0, 0, f"Connector with ID {connector_id} not found"
# Build credentials based on connector type # ── Credential building ───────────────────────────────────────
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get("composio_connected_account_id") connected_account_id = connector.config.get("composio_connected_account_id")
if not connected_account_id: if not connected_account_id:
@ -184,6 +225,7 @@ async def index_google_calendar_events(
) )
return 0, 0, "Google Calendar credentials not found in connector config" return 0, 0, "Google Calendar credentials not found in connector config"
# ── Calendar client init ──────────────────────────────────────
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Initializing Google Calendar client for connector {connector_id}", f"Initializing Google Calendar client for connector {connector_id}",
@ -203,36 +245,26 @@ async def index_google_calendar_events(
if end_date == "undefined" or end_date == "": if end_date == "undefined" or end_date == "":
end_date = None end_date = None
# Calculate date range # ── Date range calculation ────────────────────────────────────
# For calendar connectors, allow future dates to index upcoming events
if start_date is None or end_date is None: if start_date is None or end_date is None:
# Fall back to calculating dates based on last_indexed_at
# Default to today (users can manually select future dates if needed)
calculated_end_date = datetime.now() calculated_end_date = datetime.now()
# Use last_indexed_at as start date if available, otherwise use 30 days ago
if connector.last_indexed_at: if connector.last_indexed_at:
# Convert dates to be comparable (both timezone-naive)
last_indexed_naive = ( last_indexed_naive = (
connector.last_indexed_at.replace(tzinfo=None) connector.last_indexed_at.replace(tzinfo=None)
if connector.last_indexed_at.tzinfo if connector.last_indexed_at.tzinfo
else connector.last_indexed_at else connector.last_indexed_at
) )
# Allow future dates - use last_indexed_at as start date
calculated_start_date = last_indexed_naive calculated_start_date = last_indexed_naive
logger.info( logger.info(
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date" f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
) )
else: else:
calculated_start_date = datetime.now() - timedelta( calculated_start_date = datetime.now() - timedelta(days=365)
days=365
) # Use 365 days as default for calendar events (matches frontend)
logger.info( logger.info(
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date" f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
) )
# Use calculated dates if not provided
start_date_str = ( start_date_str = (
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d") start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
) )
@ -240,19 +272,14 @@ async def index_google_calendar_events(
end_date if end_date else calculated_end_date.strftime("%Y-%m-%d") end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
) )
else: else:
# Use provided dates (including future dates)
start_date_str = start_date start_date_str = start_date
end_date_str = end_date end_date_str = end_date
# FIX: Ensure end_date is at least 1 day after start_date to avoid
# "start_date must be strictly before end_date" errors when dates are the same
# (e.g., when last_indexed_at is today)
if start_date_str == end_date_str: if start_date_str == end_date_str:
logger.info( logger.info(
f"Start date ({start_date_str}) equals end date ({end_date_str}), " f"Start date ({start_date_str}) equals end date ({end_date_str}), "
"adjusting end date to next day to ensure valid date range" "adjusting end date to next day to ensure valid date range"
) )
# Parse end_date and add 1 day
try: try:
end_dt = parse_date_flexible(end_date_str) end_dt = parse_date_flexible(end_date_str)
except ValueError: except ValueError:
@ -264,6 +291,7 @@ async def index_google_calendar_events(
end_date_str = end_dt.strftime("%Y-%m-%d") end_date_str = end_dt.strftime("%Y-%m-%d")
logger.info(f"Adjusted end date to {end_date_str}") logger.info(f"Adjusted end date to {end_date_str}")
# ── Fetch events ──────────────────────────────────────────────
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}", f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
@ -274,27 +302,19 @@ async def index_google_calendar_events(
}, },
) )
# Get events within date range from primary calendar
try: try:
events, error = await calendar_client.get_all_primary_calendar_events( events, error = await calendar_client.get_all_primary_calendar_events(
start_date=start_date_str, end_date=end_date_str start_date=start_date_str, end_date=end_date_str
) )
if error: if error:
# Don't treat "No events found" as an error that should stop indexing
if "No events found" in error: if "No events found" in error:
logger.info(f"No Google Calendar events found: {error}") logger.info(f"No Google Calendar events found: {error}")
logger.info(
"No events found is not a critical error, continuing with update"
)
if update_last_indexed: if update_last_indexed:
await update_connector_last_indexed( await update_connector_last_indexed(
session, connector, update_last_indexed session, connector, update_last_indexed
) )
await session.commit() await session.commit()
logger.info(
f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found"
)
await task_logger.log_task_success( await task_logger.log_task_success(
log_entry, log_entry,
@ -304,7 +324,6 @@ async def index_google_calendar_events(
return 0, 0, None return 0, 0, None
else: else:
logger.error(f"Failed to get Google Calendar events: {error}") logger.error(f"Failed to get Google Calendar events: {error}")
# Check if this is an authentication error that requires re-authentication
error_message = error error_message = error
error_type = "APIError" error_type = "APIError"
if ( if (
@ -329,28 +348,15 @@ async def index_google_calendar_events(
logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True) logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
return 0, 0, f"Error fetching Google Calendar events: {e!s}" return 0, 0, f"Error fetching Google Calendar events: {e!s}"
documents_indexed = 0 # ── Build ConnectorDocuments ──────────────────────────────────
connector_docs: list[ConnectorDocument] = []
documents_skipped = 0 documents_skipped = 0
documents_failed = 0 # Track events that failed processing duplicate_content_count = 0
duplicate_content_count = (
0 # Track events skipped due to duplicate content_hash
)
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all events, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
events_to_process = [] # List of dicts with document and event data
new_documents_created = False
for event in events: for event in events:
try: try:
event_id = event.get("id") event_id = event.get("id")
event_summary = event.get("summary", "No Title") event_summary = event.get("summary", "No Title")
calendar_id = event.get("calendarId", "")
if not event_id: if not event_id:
logger.warning(f"Skipping event with missing ID: {event_summary}") logger.warning(f"Skipping event with missing ID: {event_summary}")
@ -363,223 +369,73 @@ async def index_google_calendar_events(
documents_skipped += 1 documents_skipped += 1
continue continue
start = event.get("start", {}) doc = _build_connector_doc(
end = event.get("end", {}) event,
start_time = start.get("dateTime") or start.get("date", "") event_markdown,
end_time = end.get("dateTime") or end.get("date", "") connector_id=connector_id,
location = event.get("location", "") search_space_id=search_space_id,
description = event.get("description", "") user_id=user_id,
enable_summary=connector.enable_summary,
# Generate unique identifier hash for this Google Calendar event
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
) )
# Generate content hash
content_hash = generate_content_hash(event_markdown, search_space_id)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Fallback: legacy Composio hash
if not existing_document:
legacy_hash = generate_unique_identifier_hash(
DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
event_id,
search_space_id,
)
existing_document = await check_document_by_unique_identifier(
session, legacy_hash
)
if existing_document:
existing_document.unique_identifier_hash = (
unique_identifier_hash
)
if (
existing_document.document_type
== DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
):
existing_document.document_type = (
DocumentType.GOOGLE_CALENDAR_CONNECTOR
)
logger.info(
f"Migrated legacy Composio Calendar document: {event_id}"
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Queue existing document for update (will be set to processing in Phase 2)
events_to_process.append(
{
"document": existing_document,
"is_new": False,
"event_markdown": event_markdown,
"content_hash": content_hash,
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"description": description,
}
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush: with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash( duplicate = await check_duplicate_document_by_hash(
session, content_hash session, compute_content_hash(doc)
) )
if duplicate:
if duplicate_by_content:
# A document with the same content already exists (likely from Composio connector)
logger.info( logger.info(
f"Event {event_summary} already indexed by another connector " f"Event {doc.title} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, " f"(existing document ID: {duplicate.id}, "
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content." f"type: {duplicate.document_type}). Skipping."
) )
duplicate_content_count += 1 duplicate_content_count += 1
documents_skipped += 1 documents_skipped += 1
continue continue
# Create new document with PENDING status (visible in UI immediately) connector_docs.append(doc)
document = Document(
search_space_id=search_space_id,
title=event_summary,
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
document_metadata={
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
events_to_process.append(
{
"document": document,
"is_new": True,
"event_markdown": event_markdown,
"content_hash": content_hash,
"event_id": event_id,
"event_summary": event_summary,
"calendar_id": calendar_id,
"start_time": start_time,
"end_time": end_time,
"location": location,
"description": description,
}
)
except Exception as e: except Exception as e:
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) logger.error(f"Error building ConnectorDocument for event: {e!s}", exc_info=True)
documents_failed += 1 documents_skipped += 1
continue continue
# Commit all pending documents - they all appear in UI now # ── Pipeline: migrate legacy docs + prepare + index ───────────
if new_documents_created: pipeline = IndexingPipelineService(session)
logger.info(
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
)
await session.commit()
# ======================================================================= await pipeline.migrate_legacy_docs(connector_docs)
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
for item in events_to_process: documents = await pipeline.prepare_for_indexing(connector_docs)
# Send heartbeat periodically
doc_map = {
compute_unique_identifier_hash(cd): cd for cd in connector_docs
}
documents_indexed = 0
documents_failed = 0
last_heartbeat_time = time.time()
for document in documents:
if on_heartbeat_callback: if on_heartbeat_callback:
current_time = time.time() current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed) await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time last_heartbeat_time = current_time
document = item["document"] connector_doc = doc_map.get(document.unique_identifier_hash)
try: if connector_doc is None:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only logger.warning(
document.status = DocumentStatus.processing() f"No matching ConnectorDocument for document {document.id}, skipping"
await session.commit() )
documents_failed += 1
continue
# Heavy processing (LLM, embeddings, chunks) try:
user_llm = await get_user_long_context_llm( user_llm = await get_user_long_context_llm(
session, user_id, search_space_id session, user_id, search_space_id
) )
await pipeline.index(document, connector_doc, user_llm)
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"event_id": item["event_id"],
"event_summary": item["event_summary"],
"calendar_id": item["calendar_id"],
"start_time": item["start_time"],
"end_time": item["end_time"],
"location": item["location"] or "No location",
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["event_markdown"], user_llm, document_metadata_for_summary
)
else:
summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(item["event_markdown"])
# Update document to READY with actual content
document.title = item["event_summary"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"event_id": item["event_id"],
"event_summary": item["event_summary"],
"calendar_id": item["calendar_id"],
"start_time": item["start_time"],
"end_time": item["end_time"],
"location": item["location"],
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
"connector_id": connector_id,
}
await safe_set_chunks(session, document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1 documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0: if documents_indexed % 10 == 0:
logger.info( logger.info(
f"Committing batch: {documents_indexed} Google Calendar events processed so far" f"Committing batch: {documents_indexed} Google Calendar events processed so far"
@ -588,21 +444,12 @@ async def index_google_calendar_events(
except Exception as e: except Exception as e:
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True) logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1 documents_failed += 1
continue continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs # ── Finalize ──────────────────────────────────────────────────
await update_connector_last_indexed(session, connector, update_last_indexed) await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info( logger.info(
f"Final commit: Total {documents_indexed} Google Calendar events processed" f"Final commit: Total {documents_indexed} Google Calendar events processed"
) )
@ -612,22 +459,18 @@ async def index_google_calendar_events(
"Successfully committed all Google Calendar document changes to database" "Successfully committed all Google Calendar document changes to database"
) )
except Exception as e: except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if ( if (
"duplicate key value violates unique constraint" in str(e).lower() "duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower() or "uniqueviolationerror" in str(e).lower()
): ):
logger.warning( logger.warning(
f"Duplicate content_hash detected during final commit. " f"Duplicate content_hash detected during final commit. "
f"This may occur if the same event was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}" f"Rolling back and continuing. Error: {e!s}"
) )
await session.rollback() await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else: else:
raise raise
# Build warning message if there were issues
warning_parts = [] warning_parts = []
if duplicate_content_count > 0: if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate") warning_parts.append(f"{duplicate_content_count} duplicate")

View file

@ -1,11 +1,11 @@
""" """
Google Gmail connector indexer. Google Gmail connector indexer.
Implements 2-phase document status updates for real-time UI feedback: Uses the shared IndexingPipelineService for document deduplication,
- Phase 1: Create all documents with 'pending' status (visible in UI immediately) summarization, chunking, and embedding.
- Phase 2: Process each document: pending processing ready/failed
""" """
import logging
import time import time
from collections.abc import Awaitable, Callable from collections.abc import Awaitable, Callable
from datetime import datetime from datetime import datetime
@ -15,21 +15,15 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.google_gmail_connector import GoogleGmailConnector from app.connectors.google_gmail_connector import GoogleGmailConnector
from app.db import ( from app.db import DocumentType, SearchSourceConnectorType
Document, from app.indexing_pipeline.connector_document import ConnectorDocument
DocumentStatus, from app.indexing_pipeline.document_hashing import (
DocumentType, compute_content_hash,
SearchSourceConnectorType, compute_unique_identifier_hash,
) )
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.google_credentials import ( from app.utils.google_credentials import (
COMPOSIO_GOOGLE_CONNECTOR_TYPES, COMPOSIO_GOOGLE_CONNECTOR_TYPES,
build_composio_credentials, build_composio_credentials,
@ -37,12 +31,9 @@ from app.utils.google_credentials import (
from .base import ( from .base import (
calculate_date_range, calculate_date_range,
check_document_by_unique_identifier,
check_duplicate_document_by_hash, check_duplicate_document_by_hash,
get_connector_by_id, get_connector_by_id,
get_current_timestamp,
logger, logger,
safe_set_chunks,
update_connector_last_indexed, update_connector_last_indexed,
) )
@ -51,13 +42,70 @@ ACCEPTED_GMAIL_CONNECTOR_TYPES = {
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR, SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
} }
# Type hint for heartbeat callback
HeartbeatCallbackType = Callable[[int], Awaitable[None]] HeartbeatCallbackType = Callable[[int], Awaitable[None]]
# Heartbeat interval in seconds
HEARTBEAT_INTERVAL_SECONDS = 30 HEARTBEAT_INTERVAL_SECONDS = 30
def _build_connector_doc(
message: dict,
markdown_content: str,
*,
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Gmail API message dict to a ConnectorDocument."""
message_id = message.get("id", "")
thread_id = message.get("threadId", "")
payload = message.get("payload", {})
headers = payload.get("headers", [])
subject = "No Subject"
sender = "Unknown Sender"
date_str = "Unknown Date"
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
if name == "subject":
subject = value
elif name == "from":
sender = value
elif name == "date":
date_str = value
metadata = {
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"connector_id": connector_id,
"document_type": "Gmail Message",
"connector_type": "Google Gmail",
}
fallback_summary = (
f"Google Gmail Message: {subject}\n\n"
f"From: {sender}\nDate: {date_str}\n\n"
f"{markdown_content}"
)
return ConnectorDocument(
title=subject,
source_markdown=markdown_content,
unique_id=message_id,
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
async def index_google_gmail_messages( async def index_google_gmail_messages(
session: AsyncSession, session: AsyncSession,
connector_id: int, connector_id: int,
@ -80,7 +128,7 @@ async def index_google_gmail_messages(
start_date: Start date for filtering messages (YYYY-MM-DD format) start_date: Start date for filtering messages (YYYY-MM-DD format)
end_date: End date for filtering messages (YYYY-MM-DD format) end_date: End date for filtering messages (YYYY-MM-DD format)
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True) update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
max_messages: Maximum number of messages to fetch (default: 100) max_messages: Maximum number of messages to fetch (default: 1000)
on_heartbeat_callback: Optional callback to update notification during long-running indexing. on_heartbeat_callback: Optional callback to update notification during long-running indexing.
Returns: Returns:
@ -88,7 +136,6 @@ async def index_google_gmail_messages(
""" """
task_logger = TaskLoggingService(session, search_space_id) task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start( log_entry = await task_logger.log_task_start(
task_name="google_gmail_messages_indexing", task_name="google_gmail_messages_indexing",
source="connector_indexing_task", source="connector_indexing_task",
@ -103,7 +150,7 @@ async def index_google_gmail_messages(
) )
try: try:
# Accept both native and Composio Gmail connectors # ── Connector lookup ──────────────────────────────────────────
connector = None connector = None
for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES: for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES:
connector = await get_connector_by_id(session, connector_id, ct) connector = await get_connector_by_id(session, connector_id, ct)
@ -117,7 +164,7 @@ async def index_google_gmail_messages(
) )
return 0, 0, error_msg return 0, 0, error_msg
# Build credentials based on connector type # ── Credential building ───────────────────────────────────────
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES: if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
connected_account_id = connector.config.get("composio_connected_account_id") connected_account_id = connector.config.get("composio_connected_account_id")
if not connected_account_id: if not connected_account_id:
@ -189,6 +236,7 @@ async def index_google_gmail_messages(
) )
return 0, 0, "Google gmail credentials not found in connector config" return 0, 0, "Google gmail credentials not found in connector config"
# ── Gmail client init ─────────────────────────────────────────
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Initializing Google gmail client for connector {connector_id}", f"Initializing Google gmail client for connector {connector_id}",
@ -199,14 +247,11 @@ async def index_google_gmail_messages(
credentials, session, user_id, connector_id credentials, session, user_id, connector_id
) )
# Calculate date range using last_indexed_at if dates not provided
# This ensures Gmail uses the same date logic as other connectors
# (uses last_indexed_at → now, or 365 days back for first-time indexing)
calculated_start_date, calculated_end_date = calculate_date_range( calculated_start_date, calculated_end_date = calculate_date_range(
connector, start_date, end_date, default_days_back=365 connector, start_date, end_date, default_days_back=365
) )
# Fetch recent Google gmail messages # ── Fetch messages ────────────────────────────────────────────
logger.info( logger.info(
f"Fetching emails for connector {connector_id} " f"Fetching emails for connector {connector_id} "
f"from {calculated_start_date} to {calculated_end_date}" f"from {calculated_start_date} to {calculated_end_date}"
@ -218,7 +263,6 @@ async def index_google_gmail_messages(
) )
if error: if error:
# Check if this is an authentication error that requires re-authentication
error_message = error error_message = error
error_type = "APIError" error_type = "APIError"
if ( if (
@ -243,263 +287,92 @@ async def index_google_gmail_messages(
logger.info(f"Found {len(messages)} Google gmail messages to index") logger.info(f"Found {len(messages)} Google gmail messages to index")
documents_indexed = 0 # ── Build ConnectorDocuments ──────────────────────────────────
connector_docs: list[ConnectorDocument] = []
documents_skipped = 0 documents_skipped = 0
documents_failed = 0 # Track messages that failed processing duplicate_content_count = 0
duplicate_content_count = (
0 # Track messages skipped due to duplicate content_hash
)
# Heartbeat tracking - update notification periodically to prevent appearing stuck
last_heartbeat_time = time.time()
# =======================================================================
# PHASE 1: Analyze all messages, create pending documents
# This makes ALL documents visible in the UI immediately with pending status
# =======================================================================
messages_to_process = [] # List of dicts with document and message data
new_documents_created = False
for message in messages: for message in messages:
try: try:
# Extract message information
message_id = message.get("id", "") message_id = message.get("id", "")
thread_id = message.get("threadId", "")
# Extract headers for subject and sender
payload = message.get("payload", {})
headers = payload.get("headers", [])
subject = "No Subject"
sender = "Unknown Sender"
date_str = "Unknown Date"
for header in headers:
name = header.get("name", "").lower()
value = header.get("value", "")
if name == "subject":
subject = value
elif name == "from":
sender = value
elif name == "date":
date_str = value
if not message_id: if not message_id:
logger.warning(f"Skipping message with missing ID: {subject}") logger.warning("Skipping message with missing ID")
documents_skipped += 1 documents_skipped += 1
continue continue
# Format message to markdown
markdown_content = gmail_connector.format_message_to_markdown(message) markdown_content = gmail_connector.format_message_to_markdown(message)
if not markdown_content.strip(): if not markdown_content.strip():
logger.warning(f"Skipping message with no content: {subject}") logger.warning(f"Skipping message with no content: {message_id}")
documents_skipped += 1 documents_skipped += 1
continue continue
# Generate unique identifier hash for this Gmail message doc = _build_connector_doc(
unique_identifier_hash = generate_unique_identifier_hash( message,
DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id markdown_content,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
) )
# Generate content hash
content_hash = generate_content_hash(markdown_content, search_space_id)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Fallback: legacy Composio hash
if not existing_document:
legacy_hash = generate_unique_identifier_hash(
DocumentType.COMPOSIO_GMAIL_CONNECTOR,
message_id,
search_space_id,
)
existing_document = await check_document_by_unique_identifier(
session, legacy_hash
)
if existing_document:
existing_document.unique_identifier_hash = (
unique_identifier_hash
)
if (
existing_document.document_type
== DocumentType.COMPOSIO_GMAIL_CONNECTOR
):
existing_document.document_type = (
DocumentType.GOOGLE_GMAIL_CONNECTOR
)
logger.info(
f"Migrated legacy Composio Gmail document: {message_id}"
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
# Ensure status is ready (might have been stuck in processing/pending)
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
):
existing_document.status = DocumentStatus.ready()
documents_skipped += 1
continue
# Queue existing document for update (will be set to processing in Phase 2)
messages_to_process.append(
{
"document": existing_document,
"is_new": False,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date_str": date_str,
}
)
continue
# Document doesn't exist by unique_identifier_hash
# Check if a document with the same content_hash exists (from another connector)
with session.no_autoflush: with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash( duplicate = await check_duplicate_document_by_hash(
session, content_hash session, compute_content_hash(doc)
) )
if duplicate:
if duplicate_by_content:
logger.info( logger.info(
f"Gmail message {subject} already indexed by another connector " f"Gmail message {doc.title} already indexed by another connector "
f"(existing document ID: {duplicate_by_content.id}, " f"(existing document ID: {duplicate.id}, "
f"type: {duplicate_by_content.document_type}). Skipping." f"type: {duplicate.document_type}). Skipping."
) )
duplicate_content_count += 1 duplicate_content_count += 1
documents_skipped += 1 documents_skipped += 1
continue continue
# Create new document with PENDING status (visible in UI immediately) connector_docs.append(doc)
document = Document(
search_space_id=search_space_id,
title=subject,
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
document_metadata={
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date": date_str,
"connector_id": connector_id,
},
content="Pending...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
chunks=[], # Empty at creation - safe for async
status=DocumentStatus.pending(), # Pending until processing starts
updated_at=get_current_timestamp(),
created_by_id=user_id,
connector_id=connector_id,
)
session.add(document)
new_documents_created = True
messages_to_process.append(
{
"document": document,
"is_new": True,
"markdown_content": markdown_content,
"content_hash": content_hash,
"message_id": message_id,
"thread_id": thread_id,
"subject": subject,
"sender": sender,
"date_str": date_str,
}
)
except Exception as e: except Exception as e:
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) logger.error(f"Error building ConnectorDocument for message: {e!s}", exc_info=True)
documents_failed += 1 documents_skipped += 1
continue continue
# Commit all pending documents - they all appear in UI now # ── Pipeline: migrate legacy docs + prepare + index ───────────
if new_documents_created: pipeline = IndexingPipelineService(session)
logger.info(
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
)
await session.commit()
# ======================================================================= await pipeline.migrate_legacy_docs(connector_docs)
# PHASE 2: Process each document one by one
# Each document transitions: pending → processing → ready/failed
# =======================================================================
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
for item in messages_to_process: documents = await pipeline.prepare_for_indexing(connector_docs)
# Send heartbeat periodically
doc_map = {
compute_unique_identifier_hash(cd): cd for cd in connector_docs
}
documents_indexed = 0
documents_failed = 0
last_heartbeat_time = time.time()
for document in documents:
if on_heartbeat_callback: if on_heartbeat_callback:
current_time = time.time() current_time = time.time()
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
await on_heartbeat_callback(documents_indexed) await on_heartbeat_callback(documents_indexed)
last_heartbeat_time = current_time last_heartbeat_time = current_time
document = item["document"] connector_doc = doc_map.get(document.unique_identifier_hash)
try: if connector_doc is None:
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only logger.warning(
document.status = DocumentStatus.processing() f"No matching ConnectorDocument for document {document.id}, skipping"
await session.commit() )
documents_failed += 1
continue
# Heavy processing (LLM, embeddings, chunks) try:
user_llm = await get_user_long_context_llm( user_llm = await get_user_long_context_llm(
session, user_id, search_space_id session, user_id, search_space_id
) )
await pipeline.index(document, connector_doc, user_llm)
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"date": item["date_str"],
"document_type": "Gmail Message",
"connector_type": "Google Gmail",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(item["markdown_content"])
# Update document to READY with actual content
document.title = item["subject"]
document.content = summary_content
document.content_hash = item["content_hash"]
document.embedding = summary_embedding
document.document_metadata = {
"message_id": item["message_id"],
"thread_id": item["thread_id"],
"subject": item["subject"],
"sender": item["sender"],
"date": item["date_str"],
"connector_id": connector_id,
}
await safe_set_chunks(session, document, chunks)
document.updated_at = get_current_timestamp()
document.status = DocumentStatus.ready()
documents_indexed += 1 documents_indexed += 1
# Batch commit every 10 documents (for ready status updates)
if documents_indexed % 10 == 0: if documents_indexed % 10 == 0:
logger.info( logger.info(
f"Committing batch: {documents_indexed} Gmail messages processed so far" f"Committing batch: {documents_indexed} Gmail messages processed so far"
@ -508,21 +381,12 @@ async def index_google_gmail_messages(
except Exception as e: except Exception as e:
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True) logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
# Mark document as failed with reason (visible in UI)
try:
document.status = DocumentStatus.failed(str(e))
document.updated_at = get_current_timestamp()
except Exception as status_error:
logger.error(
f"Failed to update document status to failed: {status_error}"
)
documents_failed += 1 documents_failed += 1
continue continue
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs # ── Finalize ──────────────────────────────────────────────────
await update_connector_last_indexed(session, connector, update_last_indexed) await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed") logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
try: try:
await session.commit() await session.commit()
@ -530,22 +394,18 @@ async def index_google_gmail_messages(
"Successfully committed all Google Gmail document changes to database" "Successfully committed all Google Gmail document changes to database"
) )
except Exception as e: except Exception as e:
# Handle any remaining integrity errors gracefully (race conditions, etc.)
if ( if (
"duplicate key value violates unique constraint" in str(e).lower() "duplicate key value violates unique constraint" in str(e).lower()
or "uniqueviolationerror" in str(e).lower() or "uniqueviolationerror" in str(e).lower()
): ):
logger.warning( logger.warning(
f"Duplicate content_hash detected during final commit. " f"Duplicate content_hash detected during final commit. "
f"This may occur if the same message was indexed by multiple connectors. "
f"Rolling back and continuing. Error: {e!s}" f"Rolling back and continuing. Error: {e!s}"
) )
await session.rollback() await session.rollback()
# Don't fail the entire task - some documents may have been successfully indexed
else: else:
raise raise
# Build warning message if there were issues
warning_parts = [] warning_parts = []
if duplicate_content_count > 0: if duplicate_content_count > 0:
warning_parts.append(f"{duplicate_content_count} duplicate") warning_parts.append(f"{duplicate_content_count} duplicate")
@ -555,7 +415,6 @@ async def index_google_gmail_messages(
total_processed = documents_indexed total_processed = documents_indexed
# Log success
await task_logger.log_task_success( await task_logger.log_task_success(
log_entry, log_entry,
f"Successfully completed Google Gmail indexing for connector {connector_id}", f"Successfully completed Google Gmail indexing for connector {connector_id}",

View file

@ -3,6 +3,7 @@ import pytest
from app.db import DocumentType from app.db import DocumentType
from app.indexing_pipeline.document_hashing import ( from app.indexing_pipeline.document_hashing import (
compute_content_hash, compute_content_hash,
compute_identifier_hash,
compute_unique_identifier_hash, compute_unique_identifier_hash,
) )
@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume
doc_a = make_connector_document(source_markdown="Original content") doc_a = make_connector_document(source_markdown="Original content")
doc_b = make_connector_document(source_markdown="Updated content") doc_b = make_connector_document(source_markdown="Updated content")
assert compute_content_hash(doc_a) != compute_content_hash(doc_b) assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document):
"""Raw-args hash equals ConnectorDocument hash for equivalent inputs."""
doc = make_connector_document(
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
unique_id="msg-123",
search_space_id=5,
)
raw_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5)
assert raw_hash == compute_unique_identifier_hash(doc)
def test_compute_identifier_hash_differs_for_different_inputs():
"""Different arguments produce different hashes."""
h1 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 1)
h2 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-2", 1)
h3 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 2)
h4 = compute_identifier_hash("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1)
assert len({h1, h2, h3, h4}) == 4