mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-27 19:25:15 +02:00
feat: enhance Google connectors indexing with content extraction and document migration
- Added `download_and_extract_content` function to extract content from Google Drive files as markdown.
- Updated Google Drive indexer to utilize the new content extraction method.
- Implemented document migration logic to update legacy Composio document types to their native Google types.
- Introduced identifier hashing for stable document identification.
- Improved file pre-filtering to handle unchanged and rename-only files efficiently.
This commit is contained in:
parent
2da6fd89ea
commit
f7b52470eb
8 changed files with 951 additions and 1588 deletions
|
|
@ -2,13 +2,14 @@
|
||||||
|
|
||||||
from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token
|
from .change_tracker import categorize_change, fetch_all_changes, get_start_page_token
|
||||||
from .client import GoogleDriveClient
|
from .client import GoogleDriveClient
|
||||||
from .content_extractor import download_and_process_file
|
from .content_extractor import download_and_extract_content, download_and_process_file
|
||||||
from .credentials import get_valid_credentials, validate_credentials
|
from .credentials import get_valid_credentials, validate_credentials
|
||||||
from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents
|
from .folder_manager import get_file_by_id, get_files_in_folder, list_folder_contents
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"GoogleDriveClient",
|
"GoogleDriveClient",
|
||||||
"categorize_change",
|
"categorize_change",
|
||||||
|
"download_and_extract_content",
|
||||||
"download_and_process_file",
|
"download_and_process_file",
|
||||||
"fetch_all_changes",
|
"fetch_all_changes",
|
||||||
"get_file_by_id",
|
"get_file_by_id",
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,160 @@ from .file_types import get_export_mime_type, is_google_workspace_file, should_s
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def download_and_extract_content(
    client: GoogleDriveClient,
    file: dict[str, Any],
) -> tuple[str | None, dict[str, Any], str | None]:
    """Download a Google Drive file and extract its content as markdown.

    ETL only -- no DB writes, no indexing, no summarization.

    Args:
        client: Drive client used to export (Google Workspace files) or
            download (everything else) the file bytes.
        file: Drive file resource dict. ``id``, ``name`` and ``mimeType`` are
            read, plus the optional ``modifiedTime``, ``createdTime``,
            ``size``, ``webViewLink`` and ``md5Checksum`` keys.

    Returns:
        (markdown_content, drive_metadata, error_message)
        On success error_message is None. On failure markdown_content is
        None and error_message describes the problem; drive_metadata is empty
        only when the file is skipped because of its mime type.
    """
    file_id = file.get("id")
    file_name = file.get("name", "Unknown")
    mime_type = file.get("mimeType", "")

    if should_skip_file(mime_type):
        return None, {}, f"Skipping {mime_type}"

    logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")

    is_workspace_file = is_google_workspace_file(mime_type)

    drive_metadata: dict[str, Any] = {
        "google_drive_file_id": file_id,
        "google_drive_file_name": file_name,
        "google_drive_mime_type": mime_type,
        "source_connector": "google_drive",
    }
    # Copy optional Drive fields only when the API response carries them.
    optional_fields = (
        ("modifiedTime", "modified_time"),
        ("createdTime", "created_time"),
        ("size", "file_size"),
        ("webViewLink", "web_view_link"),
        ("md5Checksum", "md5_checksum"),
    )
    for drive_key, metadata_key in optional_fields:
        if drive_key in file:
            drive_metadata[metadata_key] = file[drive_key]
    if is_workspace_file:
        drive_metadata["exported_as"] = "pdf"
        drive_metadata["original_workspace_type"] = mime_type.split(".")[-1]

    temp_file_path = None
    try:
        # Download / export
        if is_workspace_file:
            export_mime = get_export_mime_type(mime_type)
            if not export_mime:
                return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
            content_bytes, error = await client.export_google_file(file_id, export_mime)
            if error:
                return None, drive_metadata, error
            extension = ".pdf" if export_mime == "application/pdf" else ".txt"
            # The parser is selected from the file name, but a Workspace
            # title says nothing about the exported bytes: a Doc titled
            # "notes.txt" exported as PDF must not be read as UTF-8 text.
            # Make the name end with the extension of what was exported.
            if file_name.lower().endswith(extension):
                parse_name = file_name
            else:
                parse_name = f"{file_name}{extension}"
        else:
            content_bytes, error = await client.download_file(file_id)
            if error:
                return None, drive_metadata, error
            extension = Path(file_name).suffix or ".bin"
            parse_name = file_name

        # delete=False: the parser reopens the file by path after this
        # handle is closed; removal happens in the ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
            tmp.write(content_bytes)
            temp_file_path = tmp.name

        # Parse to markdown
        markdown = await _parse_file_to_markdown(temp_file_path, parse_name)
        return markdown, drive_metadata, None

    except Exception as e:
        # Boundary handler: any failure is reported through the returned
        # error string instead of propagating to the caller.
        logger.warning(f"Failed to extract content from {file_name}: {e!s}")
        return None, drive_metadata, str(e)
    finally:
        if temp_file_path:
            try:
                os.unlink(temp_file_path)
            except FileNotFoundError:
                # Already gone -- nothing to clean up.
                pass
            except OSError as cleanup_error:
                # Best-effort cleanup: never mask the real result, but leave
                # a trace so leaked temp files can be diagnosed.
                logger.debug(
                    f"Could not remove temp file {temp_file_path}: {cleanup_error!s}"
                )
|
||||||
|
|
||||||
|
|
||||||
|
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
    """Convert the local file at ``file_path`` to markdown.

    The handling is chosen from the (case-insensitive) suffix of ``filename``:

    * ``.md`` / ``.markdown`` / ``.txt`` -- the file is read as UTF-8 and
      returned unchanged.
    * audio/video suffixes -- the file is transcribed, either through the
      local STT service (when ``STT_SERVICE`` starts with ``local/``) or
      through ``litellm.atranscription``.
    * anything else -- the parser named by ``ETL_SERVICE`` is used
      (``UNSTRUCTURED``, ``LLAMACLOUD`` or ``DOCLING``).

    Raises:
        ValueError: the transcription produced no text.
        RuntimeError: LlamaCloud returned no documents, or ``ETL_SERVICE``
            is not one of the supported values.
    """
    name_lc = filename.lower()

    # Plain text / markdown: nothing to convert.
    plain_text_suffixes = (".md", ".markdown", ".txt")
    if name_lc.endswith(plain_text_suffixes):
        with open(file_path, encoding="utf-8") as handle:
            return handle.read()

    # Audio / video: speech-to-text.
    media_suffixes = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
    if name_lc.endswith(media_suffixes):
        from app.config import config as app_config
        from litellm import atranscription

        if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/"):
            from app.services.stt_service import stt_service

            transcript = stt_service.transcribe_file(file_path).get("text", "")
        else:
            with open(file_path, "rb") as audio_file:
                request: dict[str, Any] = {
                    "model": app_config.STT_SERVICE,
                    "file": audio_file,
                    "api_key": app_config.STT_SERVICE_API_KEY,
                }
                # The API base is optional; only forward it when configured.
                if app_config.STT_SERVICE_API_BASE:
                    request["api_base"] = app_config.STT_SERVICE_API_BASE
                response = await atranscription(**request)
            transcript = response.get("text", "")

        if transcript:
            return f"# Transcription of {filename}\n\n{transcript}"
        raise ValueError("Transcription returned empty text")

    # Document files -- use configured ETL service
    from app.config import config as app_config

    etl_service = app_config.ETL_SERVICE

    if etl_service == "UNSTRUCTURED":
        from langchain_unstructured import UnstructuredLoader

        from app.utils.document_converters import convert_document_to_markdown

        elements = await UnstructuredLoader(
            file_path,
            mode="elements",
            post_processors=[],
            languages=["eng"],
            include_orig_elements=False,
            include_metadata=False,
            strategy="auto",
        ).aload()
        return await convert_document_to_markdown(elements)

    if etl_service == "LLAMACLOUD":
        from app.tasks.document_processors.file_processors import (
            parse_with_llamacloud_retry,
        )

        parsed = await parse_with_llamacloud_retry(file_path=file_path, estimated_pages=50)
        pages = await parsed.aget_markdown_documents(split_by_page=False)
        if pages:
            return pages[0].text
        raise RuntimeError(f"LlamaCloud returned no documents for {filename}")

    if etl_service == "DOCLING":
        from docling.document_converter import DocumentConverter

        conversion = DocumentConverter().convert(file_path)
        return conversion.document.export_to_markdown()

    raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
|
||||||
|
|
||||||
|
|
||||||
async def download_and_process_file(
|
async def download_and_process_file(
|
||||||
client: GoogleDriveClient,
|
client: GoogleDriveClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
|
|
||||||
|
|
@ -3,10 +3,17 @@ import hashlib
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
|
|
||||||
|
|
||||||
|
def compute_identifier_hash(
    document_type_value: str, unique_id: str, search_space_id: int
) -> str:
    """Hash the raw identity triple of a document with SHA-256.

    The three components are joined with ``:`` in the order
    ``document_type_value``, ``unique_id``, ``search_space_id``, encoded as
    UTF-8, and the hex digest of that byte string is returned.
    """
    identity = "{}:{}:{}".format(document_type_value, unique_id, search_space_id)
    hasher = hashlib.sha256()
    hasher.update(identity.encode("utf-8"))
    return hasher.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
|
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
|
||||||
"""Return a stable SHA-256 hash identifying a document by its source identity."""
|
"""Return a stable SHA-256 hash identifying a document by its source identity."""
|
||||||
combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}"
|
return compute_identifier_hash(doc.document_type.value, doc.unique_id, doc.search_space_id)
|
||||||
return hashlib.sha256(combined.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def compute_content_hash(doc: ConnectorDocument) -> str:
|
def compute_content_hash(doc: ConnectorDocument) -> str:
|
||||||
|
|
|
||||||
|
|
@ -6,12 +6,13 @@ from sqlalchemy import delete, select
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.db import Chunk, Document, DocumentStatus
|
from app.db import NATIVE_TO_LEGACY_DOCTYPE, Chunk, Document, DocumentStatus
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
from app.indexing_pipeline.document_chunker import chunk_text
|
from app.indexing_pipeline.document_chunker import chunk_text
|
||||||
from app.indexing_pipeline.document_embedder import embed_texts
|
from app.indexing_pipeline.document_embedder import embed_texts
|
||||||
from app.indexing_pipeline.document_hashing import (
|
from app.indexing_pipeline.document_hashing import (
|
||||||
compute_content_hash,
|
compute_content_hash,
|
||||||
|
compute_identifier_hash,
|
||||||
compute_unique_identifier_hash,
|
compute_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
from app.indexing_pipeline.document_persistence import (
|
from app.indexing_pipeline.document_persistence import (
|
||||||
|
|
@ -54,6 +55,62 @@ class IndexingPipelineService:
|
||||||
def __init__(self, session: AsyncSession) -> None:
|
def __init__(self, session: AsyncSession) -> None:
|
||||||
self.session = session
|
self.session = session
|
||||||
|
|
||||||
|
async def migrate_legacy_docs(
    self, connector_docs: list[ConnectorDocument]
) -> None:
    """Re-key legacy Composio documents under their native Google type.

    Each ConnectorDocument whose document_type value has an entry in
    NATIVE_TO_LEGACY_DOCTYPE is looked up by the hash built from the legacy
    type. When such a row exists, its unique_identifier_hash is replaced
    by the hash built from the native type and its document_type is set to
    the native type, so that prepare_for_indexing() can find it under the
    native hash. The session is committed once after all documents have
    been examined.
    """
    for connector_doc in connector_docs:
        legacy_type = NATIVE_TO_LEGACY_DOCTYPE.get(connector_doc.document_type.value)
        if not legacy_type:
            # No Composio counterpart for this type -- nothing to migrate.
            continue

        lookup = select(Document).filter(
            Document.unique_identifier_hash
            == compute_identifier_hash(
                legacy_type, connector_doc.unique_id, connector_doc.search_space_id
            )
        )
        legacy_row = (await self.session.execute(lookup)).scalars().first()
        if legacy_row is None:
            continue

        # Move the stored row over to the native identity.
        legacy_row.unique_identifier_hash = compute_identifier_hash(
            connector_doc.document_type.value,
            connector_doc.unique_id,
            connector_doc.search_space_id,
        )
        legacy_row.document_type = connector_doc.document_type

    await self.session.commit()
|
||||||
|
|
||||||
|
async def index_batch(
    self, connector_docs: list[ConnectorDocument], llm
) -> list[Document]:
    """Run prepare_for_indexing() and then index() for every prepared document.

    Prepared documents are matched back to their ConnectorDocument through
    the unique identifier hash; a prepared document with no matching
    ConnectorDocument is skipped.

    Indexers that need heartbeat callbacks or custom per-document logic
    should call prepare_for_indexing() + index() directly instead.
    """
    # Hash -> source document, so each prepared row can find its input.
    sources_by_hash = {}
    for source in connector_docs:
        sources_by_hash[compute_unique_identifier_hash(source)] = source

    prepared = await self.prepare_for_indexing(connector_docs)

    indexed: list[Document] = []
    for document in prepared:
        source = sources_by_hash.get(document.unique_identifier_hash)
        if source is not None:
            indexed.append(await self.index(document, source, llm))
    return indexed
|
||||||
|
|
||||||
async def prepare_for_indexing(
|
async def prepare_for_indexing(
|
||||||
self, connector_docs: list[ConnectorDocument]
|
self, connector_docs: list[ConnectorDocument]
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
"""
|
"""
|
||||||
Google Calendar connector indexer.
|
Google Calendar connector indexer.
|
||||||
|
|
||||||
Implements 2-phase document status updates for real-time UI feedback:
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
summarization, chunking, and embedding.
|
||||||
- Phase 2: Process each document: pending → processing → ready/failed
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
@ -15,29 +14,25 @@ from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.google_calendar_connector import GoogleCalendarConnector
|
from app.connectors.google_calendar_connector import GoogleCalendarConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import DocumentType, SearchSourceConnectorType
|
||||||
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
|
from app.indexing_pipeline.document_hashing import (
|
||||||
|
compute_content_hash,
|
||||||
|
compute_unique_identifier_hash,
|
||||||
|
)
|
||||||
|
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
|
||||||
create_document_chunks,
|
|
||||||
embed_text,
|
|
||||||
generate_content_hash,
|
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
|
||||||
)
|
|
||||||
from app.utils.google_credentials import (
|
from app.utils.google_credentials import (
|
||||||
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
|
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
|
||||||
build_composio_credentials,
|
build_composio_credentials,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_document_by_unique_identifier,
|
|
||||||
check_duplicate_document_by_hash,
|
check_duplicate_document_by_hash,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
get_current_timestamp,
|
|
||||||
logger,
|
logger,
|
||||||
parse_date_flexible,
|
parse_date_flexible,
|
||||||
safe_set_chunks,
|
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -46,13 +41,60 @@ ACCEPTED_CALENDAR_CONNECTOR_TYPES = {
|
||||||
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
SearchSourceConnectorType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Type hint for heartbeat callback
|
|
||||||
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
||||||
|
|
||||||
# Heartbeat interval in seconds
|
|
||||||
HEARTBEAT_INTERVAL_SECONDS = 30
|
HEARTBEAT_INTERVAL_SECONDS = 30
|
||||||
|
|
||||||
|
|
||||||
|
def _build_connector_doc(
    event: dict,
    event_markdown: str,
    *,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    enable_summary: bool,
) -> ConnectorDocument:
    """Build the ConnectorDocument for one raw Google Calendar API event.

    The event's ``summary`` becomes the title (``"No Title"`` when absent)
    and its ``id`` the unique id. Start and end times use ``dateTime`` when
    it is set and fall back to ``date`` otherwise. ``event_markdown`` is
    stored as the source markdown and is also embedded in the fallback
    summary.
    """
    title = event.get("summary", "No Title")
    uid = event.get("id", "")
    starts = event.get("start", {})
    ends = event.get("end", {})

    return ConnectorDocument(
        title=title,
        source_markdown=event_markdown,
        unique_id=uid,
        document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
        search_space_id=search_space_id,
        connector_id=connector_id,
        created_by_id=user_id,
        should_summarize=enable_summary,
        fallback_summary=f"Google Calendar Event: {title}\n\n{event_markdown}",
        metadata={
            "event_id": uid,
            "event_summary": title,
            "calendar_id": event.get("calendarId", ""),
            # Timed events carry "dateTime"; otherwise fall back to "date".
            "start_time": starts.get("dateTime") or starts.get("date", ""),
            "end_time": ends.get("dateTime") or ends.get("date", ""),
            "location": event.get("location", ""),
            "connector_id": connector_id,
            "document_type": "Google Calendar Event",
            "connector_type": "Google Calendar",
        },
    )
|
||||||
|
|
||||||
|
|
||||||
async def index_google_calendar_events(
|
async def index_google_calendar_events(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
|
|
@ -82,7 +124,6 @@ async def index_google_calendar_events(
|
||||||
"""
|
"""
|
||||||
task_logger = TaskLoggingService(session, search_space_id)
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
# Log task start
|
|
||||||
log_entry = await task_logger.log_task_start(
|
log_entry = await task_logger.log_task_start(
|
||||||
task_name="google_calendar_events_indexing",
|
task_name="google_calendar_events_indexing",
|
||||||
source="connector_indexing_task",
|
source="connector_indexing_task",
|
||||||
|
|
@ -96,7 +137,7 @@ async def index_google_calendar_events(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Accept both native and Composio Calendar connectors
|
# ── Connector lookup ──────────────────────────────────────────
|
||||||
connector = None
|
connector = None
|
||||||
for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES:
|
for ct in ACCEPTED_CALENDAR_CONNECTOR_TYPES:
|
||||||
connector = await get_connector_by_id(session, connector_id, ct)
|
connector = await get_connector_by_id(session, connector_id, ct)
|
||||||
|
|
@ -112,7 +153,7 @@ async def index_google_calendar_events(
|
||||||
)
|
)
|
||||||
return 0, 0, f"Connector with ID {connector_id} not found"
|
return 0, 0, f"Connector with ID {connector_id} not found"
|
||||||
|
|
||||||
# Build credentials based on connector type
|
# ── Credential building ───────────────────────────────────────
|
||||||
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
|
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
|
||||||
connected_account_id = connector.config.get("composio_connected_account_id")
|
connected_account_id = connector.config.get("composio_connected_account_id")
|
||||||
if not connected_account_id:
|
if not connected_account_id:
|
||||||
|
|
@ -184,6 +225,7 @@ async def index_google_calendar_events(
|
||||||
)
|
)
|
||||||
return 0, 0, "Google Calendar credentials not found in connector config"
|
return 0, 0, "Google Calendar credentials not found in connector config"
|
||||||
|
|
||||||
|
# ── Calendar client init ──────────────────────────────────────
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Initializing Google Calendar client for connector {connector_id}",
|
f"Initializing Google Calendar client for connector {connector_id}",
|
||||||
|
|
@ -203,36 +245,26 @@ async def index_google_calendar_events(
|
||||||
if end_date == "undefined" or end_date == "":
|
if end_date == "undefined" or end_date == "":
|
||||||
end_date = None
|
end_date = None
|
||||||
|
|
||||||
# Calculate date range
|
# ── Date range calculation ────────────────────────────────────
|
||||||
# For calendar connectors, allow future dates to index upcoming events
|
|
||||||
if start_date is None or end_date is None:
|
if start_date is None or end_date is None:
|
||||||
# Fall back to calculating dates based on last_indexed_at
|
|
||||||
# Default to today (users can manually select future dates if needed)
|
|
||||||
calculated_end_date = datetime.now()
|
calculated_end_date = datetime.now()
|
||||||
|
|
||||||
# Use last_indexed_at as start date if available, otherwise use 30 days ago
|
|
||||||
if connector.last_indexed_at:
|
if connector.last_indexed_at:
|
||||||
# Convert dates to be comparable (both timezone-naive)
|
|
||||||
last_indexed_naive = (
|
last_indexed_naive = (
|
||||||
connector.last_indexed_at.replace(tzinfo=None)
|
connector.last_indexed_at.replace(tzinfo=None)
|
||||||
if connector.last_indexed_at.tzinfo
|
if connector.last_indexed_at.tzinfo
|
||||||
else connector.last_indexed_at
|
else connector.last_indexed_at
|
||||||
)
|
)
|
||||||
|
|
||||||
# Allow future dates - use last_indexed_at as start date
|
|
||||||
calculated_start_date = last_indexed_naive
|
calculated_start_date = last_indexed_naive
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
|
f"Using last_indexed_at ({calculated_start_date.strftime('%Y-%m-%d')}) as start date"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
calculated_start_date = datetime.now() - timedelta(
|
calculated_start_date = datetime.now() - timedelta(days=365)
|
||||||
days=365
|
|
||||||
) # Use 365 days as default for calendar events (matches frontend)
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
|
f"No last_indexed_at found, using {calculated_start_date.strftime('%Y-%m-%d')} (365 days ago) as start date"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use calculated dates if not provided
|
|
||||||
start_date_str = (
|
start_date_str = (
|
||||||
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
|
start_date if start_date else calculated_start_date.strftime("%Y-%m-%d")
|
||||||
)
|
)
|
||||||
|
|
@ -240,19 +272,14 @@ async def index_google_calendar_events(
|
||||||
end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
|
end_date if end_date else calculated_end_date.strftime("%Y-%m-%d")
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Use provided dates (including future dates)
|
|
||||||
start_date_str = start_date
|
start_date_str = start_date
|
||||||
end_date_str = end_date
|
end_date_str = end_date
|
||||||
|
|
||||||
# FIX: Ensure end_date is at least 1 day after start_date to avoid
|
|
||||||
# "start_date must be strictly before end_date" errors when dates are the same
|
|
||||||
# (e.g., when last_indexed_at is today)
|
|
||||||
if start_date_str == end_date_str:
|
if start_date_str == end_date_str:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Start date ({start_date_str}) equals end date ({end_date_str}), "
|
f"Start date ({start_date_str}) equals end date ({end_date_str}), "
|
||||||
"adjusting end date to next day to ensure valid date range"
|
"adjusting end date to next day to ensure valid date range"
|
||||||
)
|
)
|
||||||
# Parse end_date and add 1 day
|
|
||||||
try:
|
try:
|
||||||
end_dt = parse_date_flexible(end_date_str)
|
end_dt = parse_date_flexible(end_date_str)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
|
@ -264,6 +291,7 @@ async def index_google_calendar_events(
|
||||||
end_date_str = end_dt.strftime("%Y-%m-%d")
|
end_date_str = end_dt.strftime("%Y-%m-%d")
|
||||||
logger.info(f"Adjusted end date to {end_date_str}")
|
logger.info(f"Adjusted end date to {end_date_str}")
|
||||||
|
|
||||||
|
# ── Fetch events ──────────────────────────────────────────────
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
|
f"Fetching Google Calendar events from {start_date_str} to {end_date_str}",
|
||||||
|
|
@ -274,27 +302,19 @@ async def index_google_calendar_events(
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get events within date range from primary calendar
|
|
||||||
try:
|
try:
|
||||||
events, error = await calendar_client.get_all_primary_calendar_events(
|
events, error = await calendar_client.get_all_primary_calendar_events(
|
||||||
start_date=start_date_str, end_date=end_date_str
|
start_date=start_date_str, end_date=end_date_str
|
||||||
)
|
)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
# Don't treat "No events found" as an error that should stop indexing
|
|
||||||
if "No events found" in error:
|
if "No events found" in error:
|
||||||
logger.info(f"No Google Calendar events found: {error}")
|
logger.info(f"No Google Calendar events found: {error}")
|
||||||
logger.info(
|
|
||||||
"No events found is not a critical error, continuing with update"
|
|
||||||
)
|
|
||||||
if update_last_indexed:
|
if update_last_indexed:
|
||||||
await update_connector_last_indexed(
|
await update_connector_last_indexed(
|
||||||
session, connector, update_last_indexed
|
session, connector, update_last_indexed
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
logger.info(
|
|
||||||
f"Updated last_indexed_at to {connector.last_indexed_at} despite no events found"
|
|
||||||
)
|
|
||||||
|
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
@ -304,7 +324,6 @@ async def index_google_calendar_events(
|
||||||
return 0, 0, None
|
return 0, 0, None
|
||||||
else:
|
else:
|
||||||
logger.error(f"Failed to get Google Calendar events: {error}")
|
logger.error(f"Failed to get Google Calendar events: {error}")
|
||||||
# Check if this is an authentication error that requires re-authentication
|
|
||||||
error_message = error
|
error_message = error
|
||||||
error_type = "APIError"
|
error_type = "APIError"
|
||||||
if (
|
if (
|
||||||
|
|
@ -329,28 +348,15 @@ async def index_google_calendar_events(
|
||||||
logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
|
logger.error(f"Error fetching Google Calendar events: {e!s}", exc_info=True)
|
||||||
return 0, 0, f"Error fetching Google Calendar events: {e!s}"
|
return 0, 0, f"Error fetching Google Calendar events: {e!s}"
|
||||||
|
|
||||||
documents_indexed = 0
|
# ── Build ConnectorDocuments ──────────────────────────────────
|
||||||
|
connector_docs: list[ConnectorDocument] = []
|
||||||
documents_skipped = 0
|
documents_skipped = 0
|
||||||
documents_failed = 0 # Track events that failed processing
|
duplicate_content_count = 0
|
||||||
duplicate_content_count = (
|
|
||||||
0 # Track events skipped due to duplicate content_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
|
||||||
last_heartbeat_time = time.time()
|
|
||||||
|
|
||||||
# =======================================================================
|
|
||||||
# PHASE 1: Analyze all events, create pending documents
|
|
||||||
# This makes ALL documents visible in the UI immediately with pending status
|
|
||||||
# =======================================================================
|
|
||||||
events_to_process = [] # List of dicts with document and event data
|
|
||||||
new_documents_created = False
|
|
||||||
|
|
||||||
for event in events:
|
for event in events:
|
||||||
try:
|
try:
|
||||||
event_id = event.get("id")
|
event_id = event.get("id")
|
||||||
event_summary = event.get("summary", "No Title")
|
event_summary = event.get("summary", "No Title")
|
||||||
calendar_id = event.get("calendarId", "")
|
|
||||||
|
|
||||||
if not event_id:
|
if not event_id:
|
||||||
logger.warning(f"Skipping event with missing ID: {event_summary}")
|
logger.warning(f"Skipping event with missing ID: {event_summary}")
|
||||||
|
|
@ -363,223 +369,73 @@ async def index_google_calendar_events(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start = event.get("start", {})
|
doc = _build_connector_doc(
|
||||||
end = event.get("end", {})
|
event,
|
||||||
start_time = start.get("dateTime") or start.get("date", "")
|
event_markdown,
|
||||||
end_time = end.get("dateTime") or end.get("date", "")
|
connector_id=connector_id,
|
||||||
location = event.get("location", "")
|
search_space_id=search_space_id,
|
||||||
description = event.get("description", "")
|
user_id=user_id,
|
||||||
|
enable_summary=connector.enable_summary,
|
||||||
# Generate unique identifier hash for this Google Calendar event
|
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
|
||||||
DocumentType.GOOGLE_CALENDAR_CONNECTOR, event_id, search_space_id
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
|
||||||
content_hash = generate_content_hash(event_markdown, search_space_id)
|
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, unique_identifier_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fallback: legacy Composio hash
|
|
||||||
if not existing_document:
|
|
||||||
legacy_hash = generate_unique_identifier_hash(
|
|
||||||
DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR,
|
|
||||||
event_id,
|
|
||||||
search_space_id,
|
|
||||||
)
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, legacy_hash
|
|
||||||
)
|
|
||||||
if existing_document:
|
|
||||||
existing_document.unique_identifier_hash = (
|
|
||||||
unique_identifier_hash
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
existing_document.document_type
|
|
||||||
== DocumentType.COMPOSIO_GOOGLE_CALENDAR_CONNECTOR
|
|
||||||
):
|
|
||||||
existing_document.document_type = (
|
|
||||||
DocumentType.GOOGLE_CALENDAR_CONNECTOR
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
f"Migrated legacy Composio Calendar document: {event_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing_document:
|
|
||||||
# Document exists - check if content has changed
|
|
||||||
if existing_document.content_hash == content_hash:
|
|
||||||
# Ensure status is ready (might have been stuck in processing/pending)
|
|
||||||
if not DocumentStatus.is_state(
|
|
||||||
existing_document.status, DocumentStatus.READY
|
|
||||||
):
|
|
||||||
existing_document.status = DocumentStatus.ready()
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Queue existing document for update (will be set to processing in Phase 2)
|
|
||||||
events_to_process.append(
|
|
||||||
{
|
|
||||||
"document": existing_document,
|
|
||||||
"is_new": False,
|
|
||||||
"event_markdown": event_markdown,
|
|
||||||
"content_hash": content_hash,
|
|
||||||
"event_id": event_id,
|
|
||||||
"event_summary": event_summary,
|
|
||||||
"calendar_id": calendar_id,
|
|
||||||
"start_time": start_time,
|
|
||||||
"end_time": end_time,
|
|
||||||
"location": location,
|
|
||||||
"description": description,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Document doesn't exist by unique_identifier_hash
|
|
||||||
# Check if a document with the same content_hash exists (from another connector)
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
session, content_hash
|
session, compute_content_hash(doc)
|
||||||
)
|
)
|
||||||
|
if duplicate:
|
||||||
if duplicate_by_content:
|
|
||||||
# A document with the same content already exists (likely from Composio connector)
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Event {event_summary} already indexed by another connector "
|
f"Event {doc.title} already indexed by another connector "
|
||||||
f"(existing document ID: {duplicate_by_content.id}, "
|
f"(existing document ID: {duplicate.id}, "
|
||||||
f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
|
f"type: {duplicate.document_type}). Skipping."
|
||||||
)
|
)
|
||||||
duplicate_content_count += 1
|
duplicate_content_count += 1
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create new document with PENDING status (visible in UI immediately)
|
connector_docs.append(doc)
|
||||||
document = Document(
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
title=event_summary,
|
|
||||||
document_type=DocumentType.GOOGLE_CALENDAR_CONNECTOR,
|
|
||||||
document_metadata={
|
|
||||||
"event_id": event_id,
|
|
||||||
"event_summary": event_summary,
|
|
||||||
"calendar_id": calendar_id,
|
|
||||||
"start_time": start_time,
|
|
||||||
"end_time": end_time,
|
|
||||||
"location": location,
|
|
||||||
"connector_id": connector_id,
|
|
||||||
},
|
|
||||||
content="Pending...", # Placeholder until processed
|
|
||||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
|
||||||
embedding=None,
|
|
||||||
chunks=[], # Empty at creation - safe for async
|
|
||||||
status=DocumentStatus.pending(), # Pending until processing starts
|
|
||||||
updated_at=get_current_timestamp(),
|
|
||||||
created_by_id=user_id,
|
|
||||||
connector_id=connector_id,
|
|
||||||
)
|
|
||||||
session.add(document)
|
|
||||||
new_documents_created = True
|
|
||||||
|
|
||||||
events_to_process.append(
|
|
||||||
{
|
|
||||||
"document": document,
|
|
||||||
"is_new": True,
|
|
||||||
"event_markdown": event_markdown,
|
|
||||||
"content_hash": content_hash,
|
|
||||||
"event_id": event_id,
|
|
||||||
"event_summary": event_summary,
|
|
||||||
"calendar_id": calendar_id,
|
|
||||||
"start_time": start_time,
|
|
||||||
"end_time": end_time,
|
|
||||||
"location": location,
|
|
||||||
"description": description,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
|
logger.error(f"Error building ConnectorDocument for event: {e!s}", exc_info=True)
|
||||||
documents_failed += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Commit all pending documents - they all appear in UI now
|
# ── Pipeline: migrate legacy docs + prepare + index ───────────
|
||||||
if new_documents_created:
|
pipeline = IndexingPipelineService(session)
|
||||||
logger.info(
|
|
||||||
f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents"
|
|
||||||
)
|
|
||||||
await session.commit()
|
|
||||||
|
|
||||||
# =======================================================================
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
# PHASE 2: Process each document one by one
|
|
||||||
# Each document transitions: pending → processing → ready/failed
|
|
||||||
# =======================================================================
|
|
||||||
logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
|
|
||||||
|
|
||||||
for item in events_to_process:
|
documents = await pipeline.prepare_for_indexing(connector_docs)
|
||||||
# Send heartbeat periodically
|
|
||||||
|
doc_map = {
|
||||||
|
compute_unique_identifier_hash(cd): cd for cd in connector_docs
|
||||||
|
}
|
||||||
|
|
||||||
|
documents_indexed = 0
|
||||||
|
documents_failed = 0
|
||||||
|
last_heartbeat_time = time.time()
|
||||||
|
|
||||||
|
for document in documents:
|
||||||
if on_heartbeat_callback:
|
if on_heartbeat_callback:
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||||
await on_heartbeat_callback(documents_indexed)
|
await on_heartbeat_callback(documents_indexed)
|
||||||
last_heartbeat_time = current_time
|
last_heartbeat_time = current_time
|
||||||
|
|
||||||
document = item["document"]
|
connector_doc = doc_map.get(document.unique_identifier_hash)
|
||||||
try:
|
if connector_doc is None:
|
||||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
logger.warning(
|
||||||
document.status = DocumentStatus.processing()
|
f"No matching ConnectorDocument for document {document.id}, skipping"
|
||||||
await session.commit()
|
)
|
||||||
|
documents_failed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
try:
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
)
|
)
|
||||||
|
await pipeline.index(document, connector_doc, user_llm)
|
||||||
if user_llm and connector.enable_summary:
|
|
||||||
document_metadata_for_summary = {
|
|
||||||
"event_id": item["event_id"],
|
|
||||||
"event_summary": item["event_summary"],
|
|
||||||
"calendar_id": item["calendar_id"],
|
|
||||||
"start_time": item["start_time"],
|
|
||||||
"end_time": item["end_time"],
|
|
||||||
"location": item["location"] or "No location",
|
|
||||||
"document_type": "Google Calendar Event",
|
|
||||||
"connector_type": "Google Calendar",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["event_markdown"], user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Google Calendar Event: {item['event_summary']}\n\n{item['event_markdown']}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(item["event_markdown"])
|
|
||||||
|
|
||||||
# Update document to READY with actual content
|
|
||||||
document.title = item["event_summary"]
|
|
||||||
document.content = summary_content
|
|
||||||
document.content_hash = item["content_hash"]
|
|
||||||
document.embedding = summary_embedding
|
|
||||||
document.document_metadata = {
|
|
||||||
"event_id": item["event_id"],
|
|
||||||
"event_summary": item["event_summary"],
|
|
||||||
"calendar_id": item["calendar_id"],
|
|
||||||
"start_time": item["start_time"],
|
|
||||||
"end_time": item["end_time"],
|
|
||||||
"location": item["location"],
|
|
||||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
||||||
"connector_id": connector_id,
|
|
||||||
}
|
|
||||||
await safe_set_chunks(session, document, chunks)
|
|
||||||
document.updated_at = get_current_timestamp()
|
|
||||||
document.status = DocumentStatus.ready()
|
|
||||||
|
|
||||||
documents_indexed += 1
|
documents_indexed += 1
|
||||||
|
|
||||||
# Batch commit every 10 documents (for ready status updates)
|
|
||||||
if documents_indexed % 10 == 0:
|
if documents_indexed % 10 == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
f"Committing batch: {documents_indexed} Google Calendar events processed so far"
|
||||||
|
|
@ -588,21 +444,12 @@ async def index_google_calendar_events(
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
|
logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
|
||||||
# Mark document as failed with reason (visible in UI)
|
|
||||||
try:
|
|
||||||
document.status = DocumentStatus.failed(str(e))
|
|
||||||
document.updated_at = get_current_timestamp()
|
|
||||||
except Exception as status_error:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to update document status to failed: {status_error}"
|
|
||||||
)
|
|
||||||
documents_failed += 1
|
documents_failed += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
|
# ── Finalize ──────────────────────────────────────────────────
|
||||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
# Final commit for any remaining documents not yet committed in batches
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
f"Final commit: Total {documents_indexed} Google Calendar events processed"
|
||||||
)
|
)
|
||||||
|
|
@ -612,22 +459,18 @@ async def index_google_calendar_events(
|
||||||
"Successfully committed all Google Calendar document changes to database"
|
"Successfully committed all Google Calendar document changes to database"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
|
||||||
if (
|
if (
|
||||||
"duplicate key value violates unique constraint" in str(e).lower()
|
"duplicate key value violates unique constraint" in str(e).lower()
|
||||||
or "uniqueviolationerror" in str(e).lower()
|
or "uniqueviolationerror" in str(e).lower()
|
||||||
):
|
):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Duplicate content_hash detected during final commit. "
|
f"Duplicate content_hash detected during final commit. "
|
||||||
f"This may occur if the same event was indexed by multiple connectors. "
|
|
||||||
f"Rolling back and continuing. Error: {e!s}"
|
f"Rolling back and continuing. Error: {e!s}"
|
||||||
)
|
)
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
# Don't fail the entire task - some documents may have been successfully indexed
|
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Build warning message if there were issues
|
|
||||||
warning_parts = []
|
warning_parts = []
|
||||||
if duplicate_content_count > 0:
|
if duplicate_content_count > 0:
|
||||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,11 +1,11 @@
|
||||||
"""
|
"""
|
||||||
Google Gmail connector indexer.
|
Google Gmail connector indexer.
|
||||||
|
|
||||||
Implements 2-phase document status updates for real-time UI feedback:
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately)
|
summarization, chunking, and embedding.
|
||||||
- Phase 2: Process each document: pending → processing → ready/failed
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
import time
|
import time
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
@ -15,21 +15,15 @@ from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.google_gmail_connector import GoogleGmailConnector
|
from app.connectors.google_gmail_connector import GoogleGmailConnector
|
||||||
from app.db import (
|
from app.db import DocumentType, SearchSourceConnectorType
|
||||||
Document,
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
DocumentStatus,
|
from app.indexing_pipeline.document_hashing import (
|
||||||
DocumentType,
|
compute_content_hash,
|
||||||
SearchSourceConnectorType,
|
compute_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
from app.services.llm_service import get_user_long_context_llm
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
|
||||||
create_document_chunks,
|
|
||||||
embed_text,
|
|
||||||
generate_content_hash,
|
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
|
||||||
)
|
|
||||||
from app.utils.google_credentials import (
|
from app.utils.google_credentials import (
|
||||||
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
|
COMPOSIO_GOOGLE_CONNECTOR_TYPES,
|
||||||
build_composio_credentials,
|
build_composio_credentials,
|
||||||
|
|
@ -37,12 +31,9 @@ from app.utils.google_credentials import (
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
calculate_date_range,
|
calculate_date_range,
|
||||||
check_document_by_unique_identifier,
|
|
||||||
check_duplicate_document_by_hash,
|
check_duplicate_document_by_hash,
|
||||||
get_connector_by_id,
|
get_connector_by_id,
|
||||||
get_current_timestamp,
|
|
||||||
logger,
|
logger,
|
||||||
safe_set_chunks,
|
|
||||||
update_connector_last_indexed,
|
update_connector_last_indexed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -51,13 +42,70 @@ ACCEPTED_GMAIL_CONNECTOR_TYPES = {
|
||||||
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
SearchSourceConnectorType.COMPOSIO_GMAIL_CONNECTOR,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Type hint for heartbeat callback
|
|
||||||
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
|
||||||
|
|
||||||
# Heartbeat interval in seconds
|
|
||||||
HEARTBEAT_INTERVAL_SECONDS = 30
|
HEARTBEAT_INTERVAL_SECONDS = 30
|
||||||
|
|
||||||
|
|
||||||
|
def _build_connector_doc(
|
||||||
|
message: dict,
|
||||||
|
markdown_content: str,
|
||||||
|
*,
|
||||||
|
connector_id: int,
|
||||||
|
search_space_id: int,
|
||||||
|
user_id: str,
|
||||||
|
enable_summary: bool,
|
||||||
|
) -> ConnectorDocument:
|
||||||
|
"""Map a raw Gmail API message dict to a ConnectorDocument."""
|
||||||
|
message_id = message.get("id", "")
|
||||||
|
thread_id = message.get("threadId", "")
|
||||||
|
payload = message.get("payload", {})
|
||||||
|
headers = payload.get("headers", [])
|
||||||
|
|
||||||
|
subject = "No Subject"
|
||||||
|
sender = "Unknown Sender"
|
||||||
|
date_str = "Unknown Date"
|
||||||
|
|
||||||
|
for header in headers:
|
||||||
|
name = header.get("name", "").lower()
|
||||||
|
value = header.get("value", "")
|
||||||
|
if name == "subject":
|
||||||
|
subject = value
|
||||||
|
elif name == "from":
|
||||||
|
sender = value
|
||||||
|
elif name == "date":
|
||||||
|
date_str = value
|
||||||
|
|
||||||
|
metadata = {
|
||||||
|
"message_id": message_id,
|
||||||
|
"thread_id": thread_id,
|
||||||
|
"subject": subject,
|
||||||
|
"sender": sender,
|
||||||
|
"date": date_str,
|
||||||
|
"connector_id": connector_id,
|
||||||
|
"document_type": "Gmail Message",
|
||||||
|
"connector_type": "Google Gmail",
|
||||||
|
}
|
||||||
|
|
||||||
|
fallback_summary = (
|
||||||
|
f"Google Gmail Message: {subject}\n\n"
|
||||||
|
f"From: {sender}\nDate: {date_str}\n\n"
|
||||||
|
f"{markdown_content}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ConnectorDocument(
|
||||||
|
title=subject,
|
||||||
|
source_markdown=markdown_content,
|
||||||
|
unique_id=message_id,
|
||||||
|
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
connector_id=connector_id,
|
||||||
|
created_by_id=user_id,
|
||||||
|
should_summarize=enable_summary,
|
||||||
|
fallback_summary=fallback_summary,
|
||||||
|
metadata=metadata,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def index_google_gmail_messages(
|
async def index_google_gmail_messages(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
|
|
@ -80,7 +128,7 @@ async def index_google_gmail_messages(
|
||||||
start_date: Start date for filtering messages (YYYY-MM-DD format)
|
start_date: Start date for filtering messages (YYYY-MM-DD format)
|
||||||
end_date: End date for filtering messages (YYYY-MM-DD format)
|
end_date: End date for filtering messages (YYYY-MM-DD format)
|
||||||
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
|
||||||
max_messages: Maximum number of messages to fetch (default: 100)
|
max_messages: Maximum number of messages to fetch (default: 1000)
|
||||||
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
|
on_heartbeat_callback: Optional callback to update notification during long-running indexing.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -88,7 +136,6 @@ async def index_google_gmail_messages(
|
||||||
"""
|
"""
|
||||||
task_logger = TaskLoggingService(session, search_space_id)
|
task_logger = TaskLoggingService(session, search_space_id)
|
||||||
|
|
||||||
# Log task start
|
|
||||||
log_entry = await task_logger.log_task_start(
|
log_entry = await task_logger.log_task_start(
|
||||||
task_name="google_gmail_messages_indexing",
|
task_name="google_gmail_messages_indexing",
|
||||||
source="connector_indexing_task",
|
source="connector_indexing_task",
|
||||||
|
|
@ -103,7 +150,7 @@ async def index_google_gmail_messages(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Accept both native and Composio Gmail connectors
|
# ── Connector lookup ──────────────────────────────────────────
|
||||||
connector = None
|
connector = None
|
||||||
for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES:
|
for ct in ACCEPTED_GMAIL_CONNECTOR_TYPES:
|
||||||
connector = await get_connector_by_id(session, connector_id, ct)
|
connector = await get_connector_by_id(session, connector_id, ct)
|
||||||
|
|
@ -117,7 +164,7 @@ async def index_google_gmail_messages(
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg
|
return 0, 0, error_msg
|
||||||
|
|
||||||
# Build credentials based on connector type
|
# ── Credential building ───────────────────────────────────────
|
||||||
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
|
if connector.connector_type in COMPOSIO_GOOGLE_CONNECTOR_TYPES:
|
||||||
connected_account_id = connector.config.get("composio_connected_account_id")
|
connected_account_id = connector.config.get("composio_connected_account_id")
|
||||||
if not connected_account_id:
|
if not connected_account_id:
|
||||||
|
|
@ -189,6 +236,7 @@ async def index_google_gmail_messages(
|
||||||
)
|
)
|
||||||
return 0, 0, "Google gmail credentials not found in connector config"
|
return 0, 0, "Google gmail credentials not found in connector config"
|
||||||
|
|
||||||
|
# ── Gmail client init ─────────────────────────────────────────
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Initializing Google gmail client for connector {connector_id}",
|
f"Initializing Google gmail client for connector {connector_id}",
|
||||||
|
|
@ -199,14 +247,11 @@ async def index_google_gmail_messages(
|
||||||
credentials, session, user_id, connector_id
|
credentials, session, user_id, connector_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate date range using last_indexed_at if dates not provided
|
|
||||||
# This ensures Gmail uses the same date logic as other connectors
|
|
||||||
# (uses last_indexed_at → now, or 365 days back for first-time indexing)
|
|
||||||
calculated_start_date, calculated_end_date = calculate_date_range(
|
calculated_start_date, calculated_end_date = calculate_date_range(
|
||||||
connector, start_date, end_date, default_days_back=365
|
connector, start_date, end_date, default_days_back=365
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fetch recent Google gmail messages
|
# ── Fetch messages ────────────────────────────────────────────
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Fetching emails for connector {connector_id} "
|
f"Fetching emails for connector {connector_id} "
|
||||||
f"from {calculated_start_date} to {calculated_end_date}"
|
f"from {calculated_start_date} to {calculated_end_date}"
|
||||||
|
|
@ -218,7 +263,6 @@ async def index_google_gmail_messages(
|
||||||
)
|
)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
# Check if this is an authentication error that requires re-authentication
|
|
||||||
error_message = error
|
error_message = error
|
||||||
error_type = "APIError"
|
error_type = "APIError"
|
||||||
if (
|
if (
|
||||||
|
|
@ -243,263 +287,92 @@ async def index_google_gmail_messages(
|
||||||
|
|
||||||
logger.info(f"Found {len(messages)} Google gmail messages to index")
|
logger.info(f"Found {len(messages)} Google gmail messages to index")
|
||||||
|
|
||||||
documents_indexed = 0
|
# ── Build ConnectorDocuments ──────────────────────────────────
|
||||||
|
connector_docs: list[ConnectorDocument] = []
|
||||||
documents_skipped = 0
|
documents_skipped = 0
|
||||||
documents_failed = 0 # Track messages that failed processing
|
duplicate_content_count = 0
|
||||||
duplicate_content_count = (
|
|
||||||
0 # Track messages skipped due to duplicate content_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
# Heartbeat tracking - update notification periodically to prevent appearing stuck
|
|
||||||
last_heartbeat_time = time.time()
|
|
||||||
|
|
||||||
# =======================================================================
|
|
||||||
# PHASE 1: Analyze all messages, create pending documents
|
|
||||||
# This makes ALL documents visible in the UI immediately with pending status
|
|
||||||
# =======================================================================
|
|
||||||
messages_to_process = [] # List of dicts with document and message data
|
|
||||||
new_documents_created = False
|
|
||||||
|
|
||||||
for message in messages:
|
for message in messages:
|
||||||
try:
|
try:
|
||||||
# Extract message information
|
|
||||||
message_id = message.get("id", "")
|
message_id = message.get("id", "")
|
||||||
thread_id = message.get("threadId", "")
|
|
||||||
|
|
||||||
# Extract headers for subject and sender
|
|
||||||
payload = message.get("payload", {})
|
|
||||||
headers = payload.get("headers", [])
|
|
||||||
|
|
||||||
subject = "No Subject"
|
|
||||||
sender = "Unknown Sender"
|
|
||||||
date_str = "Unknown Date"
|
|
||||||
|
|
||||||
for header in headers:
|
|
||||||
name = header.get("name", "").lower()
|
|
||||||
value = header.get("value", "")
|
|
||||||
if name == "subject":
|
|
||||||
subject = value
|
|
||||||
elif name == "from":
|
|
||||||
sender = value
|
|
||||||
elif name == "date":
|
|
||||||
date_str = value
|
|
||||||
|
|
||||||
if not message_id:
|
if not message_id:
|
||||||
logger.warning(f"Skipping message with missing ID: {subject}")
|
logger.warning("Skipping message with missing ID")
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Format message to markdown
|
|
||||||
markdown_content = gmail_connector.format_message_to_markdown(message)
|
markdown_content = gmail_connector.format_message_to_markdown(message)
|
||||||
|
|
||||||
if not markdown_content.strip():
|
if not markdown_content.strip():
|
||||||
logger.warning(f"Skipping message with no content: {subject}")
|
logger.warning(f"Skipping message with no content: {message_id}")
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Generate unique identifier hash for this Gmail message
|
doc = _build_connector_doc(
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
message,
|
||||||
DocumentType.GOOGLE_GMAIL_CONNECTOR, message_id, search_space_id
|
markdown_content,
|
||||||
|
connector_id=connector_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
user_id=user_id,
|
||||||
|
enable_summary=connector.enable_summary,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
|
||||||
content_hash = generate_content_hash(markdown_content, search_space_id)
|
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, unique_identifier_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fallback: legacy Composio hash
|
|
||||||
if not existing_document:
|
|
||||||
legacy_hash = generate_unique_identifier_hash(
|
|
||||||
DocumentType.COMPOSIO_GMAIL_CONNECTOR,
|
|
||||||
message_id,
|
|
||||||
search_space_id,
|
|
||||||
)
|
|
||||||
existing_document = await check_document_by_unique_identifier(
|
|
||||||
session, legacy_hash
|
|
||||||
)
|
|
||||||
if existing_document:
|
|
||||||
existing_document.unique_identifier_hash = (
|
|
||||||
unique_identifier_hash
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
existing_document.document_type
|
|
||||||
== DocumentType.COMPOSIO_GMAIL_CONNECTOR
|
|
||||||
):
|
|
||||||
existing_document.document_type = (
|
|
||||||
DocumentType.GOOGLE_GMAIL_CONNECTOR
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
f"Migrated legacy Composio Gmail document: {message_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing_document:
|
|
||||||
# Document exists - check if content has changed
|
|
||||||
if existing_document.content_hash == content_hash:
|
|
||||||
# Ensure status is ready (might have been stuck in processing/pending)
|
|
||||||
if not DocumentStatus.is_state(
|
|
||||||
existing_document.status, DocumentStatus.READY
|
|
||||||
):
|
|
||||||
existing_document.status = DocumentStatus.ready()
|
|
||||||
documents_skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Queue existing document for update (will be set to processing in Phase 2)
|
|
||||||
messages_to_process.append(
|
|
||||||
{
|
|
||||||
"document": existing_document,
|
|
||||||
"is_new": False,
|
|
||||||
"markdown_content": markdown_content,
|
|
||||||
"content_hash": content_hash,
|
|
||||||
"message_id": message_id,
|
|
||||||
"thread_id": thread_id,
|
|
||||||
"subject": subject,
|
|
||||||
"sender": sender,
|
|
||||||
"date_str": date_str,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Document doesn't exist by unique_identifier_hash
|
|
||||||
# Check if a document with the same content_hash exists (from another connector)
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
session, content_hash
|
session, compute_content_hash(doc)
|
||||||
)
|
)
|
||||||
|
if duplicate:
|
||||||
if duplicate_by_content:
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Gmail message {subject} already indexed by another connector "
|
f"Gmail message {doc.title} already indexed by another connector "
|
||||||
f"(existing document ID: {duplicate_by_content.id}, "
|
f"(existing document ID: {duplicate.id}, "
|
||||||
f"type: {duplicate_by_content.document_type}). Skipping."
|
f"type: {duplicate.document_type}). Skipping."
|
||||||
)
|
)
|
||||||
duplicate_content_count += 1
|
duplicate_content_count += 1
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create new document with PENDING status (visible in UI immediately)
|
connector_docs.append(doc)
|
||||||
document = Document(
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
title=subject,
|
|
||||||
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
|
||||||
document_metadata={
|
|
||||||
"message_id": message_id,
|
|
||||||
"thread_id": thread_id,
|
|
||||||
"subject": subject,
|
|
||||||
"sender": sender,
|
|
||||||
"date": date_str,
|
|
||||||
"connector_id": connector_id,
|
|
||||||
},
|
|
||||||
content="Pending...", # Placeholder until processed
|
|
||||||
content_hash=unique_identifier_hash, # Temporary unique value - updated when ready
|
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
|
||||||
embedding=None,
|
|
||||||
chunks=[], # Empty at creation - safe for async
|
|
||||||
status=DocumentStatus.pending(), # Pending until processing starts
|
|
||||||
updated_at=get_current_timestamp(),
|
|
||||||
created_by_id=user_id,
|
|
||||||
connector_id=connector_id,
|
|
||||||
)
|
|
||||||
session.add(document)
|
|
||||||
new_documents_created = True
|
|
||||||
|
|
||||||
messages_to_process.append(
|
|
||||||
{
|
|
||||||
"document": document,
|
|
||||||
"is_new": True,
|
|
||||||
"markdown_content": markdown_content,
|
|
||||||
"content_hash": content_hash,
|
|
||||||
"message_id": message_id,
|
|
||||||
"thread_id": thread_id,
|
|
||||||
"subject": subject,
|
|
||||||
"sender": sender,
|
|
||||||
"date_str": date_str,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True)
|
logger.error(f"Error building ConnectorDocument for message: {e!s}", exc_info=True)
|
||||||
documents_failed += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Commit all pending documents - they all appear in UI now
|
# ── Pipeline: migrate legacy docs + prepare + index ───────────
|
||||||
if new_documents_created:
|
pipeline = IndexingPipelineService(session)
|
||||||
logger.info(
|
|
||||||
f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents"
|
|
||||||
)
|
|
||||||
await session.commit()
|
|
||||||
|
|
||||||
# =======================================================================
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
# PHASE 2: Process each document one by one
|
|
||||||
# Each document transitions: pending → processing → ready/failed
|
|
||||||
# =======================================================================
|
|
||||||
logger.info(f"Phase 2: Processing {len(messages_to_process)} documents")
|
|
||||||
|
|
||||||
for item in messages_to_process:
|
documents = await pipeline.prepare_for_indexing(connector_docs)
|
||||||
# Send heartbeat periodically
|
|
||||||
|
doc_map = {
|
||||||
|
compute_unique_identifier_hash(cd): cd for cd in connector_docs
|
||||||
|
}
|
||||||
|
|
||||||
|
documents_indexed = 0
|
||||||
|
documents_failed = 0
|
||||||
|
last_heartbeat_time = time.time()
|
||||||
|
|
||||||
|
for document in documents:
|
||||||
if on_heartbeat_callback:
|
if on_heartbeat_callback:
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
|
||||||
await on_heartbeat_callback(documents_indexed)
|
await on_heartbeat_callback(documents_indexed)
|
||||||
last_heartbeat_time = current_time
|
last_heartbeat_time = current_time
|
||||||
|
|
||||||
document = item["document"]
|
connector_doc = doc_map.get(document.unique_identifier_hash)
|
||||||
try:
|
if connector_doc is None:
|
||||||
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only
|
logger.warning(
|
||||||
document.status = DocumentStatus.processing()
|
f"No matching ConnectorDocument for document {document.id}, skipping"
|
||||||
await session.commit()
|
)
|
||||||
|
documents_failed += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
try:
|
||||||
user_llm = await get_user_long_context_llm(
|
user_llm = await get_user_long_context_llm(
|
||||||
session, user_id, search_space_id
|
session, user_id, search_space_id
|
||||||
)
|
)
|
||||||
|
await pipeline.index(document, connector_doc, user_llm)
|
||||||
if user_llm and connector.enable_summary:
|
|
||||||
document_metadata_for_summary = {
|
|
||||||
"message_id": item["message_id"],
|
|
||||||
"thread_id": item["thread_id"],
|
|
||||||
"subject": item["subject"],
|
|
||||||
"sender": item["sender"],
|
|
||||||
"date": item["date_str"],
|
|
||||||
"document_type": "Gmail Message",
|
|
||||||
"connector_type": "Google Gmail",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["markdown_content"],
|
|
||||||
user_llm,
|
|
||||||
document_metadata_for_summary,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Google Gmail Message: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}\n\n{item['markdown_content']}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(item["markdown_content"])
|
|
||||||
|
|
||||||
# Update document to READY with actual content
|
|
||||||
document.title = item["subject"]
|
|
||||||
document.content = summary_content
|
|
||||||
document.content_hash = item["content_hash"]
|
|
||||||
document.embedding = summary_embedding
|
|
||||||
document.document_metadata = {
|
|
||||||
"message_id": item["message_id"],
|
|
||||||
"thread_id": item["thread_id"],
|
|
||||||
"subject": item["subject"],
|
|
||||||
"sender": item["sender"],
|
|
||||||
"date": item["date_str"],
|
|
||||||
"connector_id": connector_id,
|
|
||||||
}
|
|
||||||
await safe_set_chunks(session, document, chunks)
|
|
||||||
document.updated_at = get_current_timestamp()
|
|
||||||
document.status = DocumentStatus.ready()
|
|
||||||
|
|
||||||
documents_indexed += 1
|
documents_indexed += 1
|
||||||
|
|
||||||
# Batch commit every 10 documents (for ready status updates)
|
|
||||||
if documents_indexed % 10 == 0:
|
if documents_indexed % 10 == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Committing batch: {documents_indexed} Gmail messages processed so far"
|
f"Committing batch: {documents_indexed} Gmail messages processed so far"
|
||||||
|
|
@ -508,21 +381,12 @@ async def index_google_gmail_messages(
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
|
logger.error(f"Error processing Gmail message: {e!s}", exc_info=True)
|
||||||
# Mark document as failed with reason (visible in UI)
|
|
||||||
try:
|
|
||||||
document.status = DocumentStatus.failed(str(e))
|
|
||||||
document.updated_at = get_current_timestamp()
|
|
||||||
except Exception as status_error:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to update document status to failed: {status_error}"
|
|
||||||
)
|
|
||||||
documents_failed += 1
|
documents_failed += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# CRITICAL: Always update timestamp (even if 0 documents indexed) so Zero syncs
|
# ── Finalize ──────────────────────────────────────────────────
|
||||||
await update_connector_last_indexed(session, connector, update_last_indexed)
|
await update_connector_last_indexed(session, connector, update_last_indexed)
|
||||||
|
|
||||||
# Final commit for any remaining documents not yet committed in batches
|
|
||||||
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
|
logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed")
|
||||||
try:
|
try:
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
@ -530,22 +394,18 @@ async def index_google_gmail_messages(
|
||||||
"Successfully committed all Google Gmail document changes to database"
|
"Successfully committed all Google Gmail document changes to database"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Handle any remaining integrity errors gracefully (race conditions, etc.)
|
|
||||||
if (
|
if (
|
||||||
"duplicate key value violates unique constraint" in str(e).lower()
|
"duplicate key value violates unique constraint" in str(e).lower()
|
||||||
or "uniqueviolationerror" in str(e).lower()
|
or "uniqueviolationerror" in str(e).lower()
|
||||||
):
|
):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Duplicate content_hash detected during final commit. "
|
f"Duplicate content_hash detected during final commit. "
|
||||||
f"This may occur if the same message was indexed by multiple connectors. "
|
|
||||||
f"Rolling back and continuing. Error: {e!s}"
|
f"Rolling back and continuing. Error: {e!s}"
|
||||||
)
|
)
|
||||||
await session.rollback()
|
await session.rollback()
|
||||||
# Don't fail the entire task - some documents may have been successfully indexed
|
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Build warning message if there were issues
|
|
||||||
warning_parts = []
|
warning_parts = []
|
||||||
if duplicate_content_count > 0:
|
if duplicate_content_count > 0:
|
||||||
warning_parts.append(f"{duplicate_content_count} duplicate")
|
warning_parts.append(f"{duplicate_content_count} duplicate")
|
||||||
|
|
@ -555,7 +415,6 @@ async def index_google_gmail_messages(
|
||||||
|
|
||||||
total_processed = documents_indexed
|
total_processed = documents_indexed
|
||||||
|
|
||||||
# Log success
|
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Successfully completed Google Gmail indexing for connector {connector_id}",
|
f"Successfully completed Google Gmail indexing for connector {connector_id}",
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import pytest
|
||||||
from app.db import DocumentType
|
from app.db import DocumentType
|
||||||
from app.indexing_pipeline.document_hashing import (
|
from app.indexing_pipeline.document_hashing import (
|
||||||
compute_content_hash,
|
compute_content_hash,
|
||||||
|
compute_identifier_hash,
|
||||||
compute_unique_identifier_hash,
|
compute_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -61,3 +62,23 @@ def test_different_content_produces_different_content_hash(make_connector_docume
|
||||||
doc_a = make_connector_document(source_markdown="Original content")
|
doc_a = make_connector_document(source_markdown="Original content")
|
||||||
doc_b = make_connector_document(source_markdown="Updated content")
|
doc_b = make_connector_document(source_markdown="Updated content")
|
||||||
assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
|
assert compute_content_hash(doc_a) != compute_content_hash(doc_b)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_identifier_hash_matches_connector_doc_hash(make_connector_document):
|
||||||
|
"""Raw-args hash equals ConnectorDocument hash for equivalent inputs."""
|
||||||
|
doc = make_connector_document(
|
||||||
|
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
||||||
|
unique_id="msg-123",
|
||||||
|
search_space_id=5,
|
||||||
|
)
|
||||||
|
raw_hash = compute_identifier_hash("GOOGLE_GMAIL_CONNECTOR", "msg-123", 5)
|
||||||
|
assert raw_hash == compute_unique_identifier_hash(doc)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_identifier_hash_differs_for_different_inputs():
|
||||||
|
"""Different arguments produce different hashes."""
|
||||||
|
h1 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 1)
|
||||||
|
h2 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-2", 1)
|
||||||
|
h3 = compute_identifier_hash("GOOGLE_DRIVE_FILE", "file-1", 2)
|
||||||
|
h4 = compute_identifier_hash("COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "file-1", 1)
|
||||||
|
assert len({h1, h2, h3, h4}) == 4
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue