Merge pull request #706 from AnishSarkar22/fix/drive-index

feat: enhance Google Drive indexing
This commit is contained in:
Rohan Verma 2026-01-18 22:15:03 -08:00 committed by GitHub
commit 87a174a1fd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 1116 additions and 279 deletions

View file

@ -58,7 +58,7 @@ async def get_changes(
params = { params = {
"pageToken": page_token, "pageToken": page_token,
"pageSize": 100, "pageSize": 100,
"fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, size, webViewLink, parents, trashed))", "fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, trashed))",
"supportsAllDrives": True, "supportsAllDrives": True,
"includeItemsFromAllDrives": True, "includeItemsFromAllDrives": True,
} }

View file

@ -47,7 +47,7 @@ class GoogleDriveClient:
async def list_files( async def list_files(
self, self,
query: str = "", query: str = "",
fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, size, webViewLink, parents, owners, createdTime, description)", fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",
page_size: int = 100, page_size: int = 100,
page_token: str | None = None, page_token: str | None = None,
) -> tuple[list[dict[str, Any]], str | None, str | None]: ) -> tuple[list[dict[str, Any]], str | None, str | None]:

View file

@ -102,6 +102,8 @@ async def download_and_process_file(
connector_info["metadata"]["file_size"] = file["size"] connector_info["metadata"]["file_size"] = file["size"]
if "webViewLink" in file: if "webViewLink" in file:
connector_info["metadata"]["web_view_link"] = file["webViewLink"] connector_info["metadata"]["web_view_link"] = file["webViewLink"]
if "md5Checksum" in file:
connector_info["metadata"]["md5_checksum"] = file["md5Checksum"]
if is_google_workspace_file(mime_type): if is_google_workspace_file(mime_type):
connector_info["metadata"]["exported_as"] = "pdf" connector_info["metadata"]["exported_as"] = "pdf"

View file

@ -157,7 +157,7 @@ async def get_file_by_id(
try: try:
file, error = await client.get_file_metadata( file, error = await client.get_file_metadata(
file_id, file_id,
fields="id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink", fields="id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink",
) )
if error: if error:
@ -228,7 +228,7 @@ async def list_folder_contents(
while True: while True:
items, next_token, error = await client.list_files( items, next_token, error = await client.list_files(
query=query, query=query,
fields="files(id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink)", fields="files(id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink)",
page_size=1000, # Max allowed by Google Drive API page_size=1000, # Max allowed by Google Drive API
page_token=page_token, page_token=page_token,
) )

View file

@ -1716,7 +1716,7 @@ async def run_google_drive_indexing(
connector_id: int, connector_id: int,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
items_dict: dict, # Dictionary with 'folders' and 'files' lists items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
): ):
"""Runs the Google Drive indexing task for folders and files with notifications.""" """Runs the Google Drive indexing task for folders and files with notifications."""
from uuid import UUID from uuid import UUID
@ -1730,6 +1730,7 @@ async def run_google_drive_indexing(
# Parse the structured data # Parse the structured data
items = GoogleDriveIndexRequest(**items_dict) items = GoogleDriveIndexRequest(**items_dict)
indexing_options = items.indexing_options
total_indexed = 0 total_indexed = 0
errors = [] errors = []
@ -1765,7 +1766,7 @@ async def run_google_drive_indexing(
stage="fetching", stage="fetching",
) )
# Index each folder # Index each folder with indexing options
for folder in items.folders: for folder in items.folders:
try: try:
indexed_count, error_message = await index_google_drive_files( indexed_count, error_message = await index_google_drive_files(
@ -1775,8 +1776,10 @@ async def run_google_drive_indexing(
user_id, user_id,
folder_id=folder.id, folder_id=folder.id,
folder_name=folder.name, folder_name=folder.name,
use_delta_sync=True, use_delta_sync=indexing_options.incremental_sync,
update_last_indexed=False, update_last_indexed=False,
max_files=indexing_options.max_files_per_folder,
include_subfolders=indexing_options.include_subfolders,
) )
if error_message: if error_message:
errors.append(f"Folder '{folder.name}': {error_message}") errors.append(f"Folder '{folder.name}': {error_message}")
@ -1837,6 +1840,8 @@ async def run_google_drive_indexing(
# Update notification on completion # Update notification on completion
if notification: if notification:
# Refresh notification to reload attributes that may have been expired by earlier commits
await session.refresh(notification)
await NotificationService.connector_indexing.notify_indexing_completed( await NotificationService.connector_indexing.notify_indexing_completed(
session=session, session=session,
notification=notification, notification=notification,

View file

@ -10,7 +10,7 @@ from .documents import (
ExtensionDocumentMetadata, ExtensionDocumentMetadata,
PaginatedResponse, PaginatedResponse,
) )
from .google_drive import DriveItem, GoogleDriveIndexRequest from .google_drive import DriveItem, GoogleDriveIndexingOptions, GoogleDriveIndexRequest
from .logs import LogBase, LogCreate, LogFilter, LogRead, LogUpdate from .logs import LogBase, LogCreate, LogFilter, LogRead, LogUpdate
from .new_chat import ( from .new_chat import (
ChatMessage, ChatMessage,
@ -94,6 +94,7 @@ __all__ = [
"ExtensionDocumentMetadata", "ExtensionDocumentMetadata",
"GlobalNewLLMConfigRead", "GlobalNewLLMConfigRead",
"GoogleDriveIndexRequest", "GoogleDriveIndexRequest",
"GoogleDriveIndexingOptions",
# Base schemas # Base schemas
"IDModel", "IDModel",
# RBAC schemas # RBAC schemas

View file

@ -10,6 +10,25 @@ class DriveItem(BaseModel):
name: str = Field(..., description="Item display name") name: str = Field(..., description="Item display name")
class GoogleDriveIndexingOptions(BaseModel):
    """Indexing options for Google Drive connector."""

    # Upper bound on how many files are indexed per selected folder.
    # Validation rejects values outside the inclusive range 1..1000.
    max_files_per_folder: int = Field(
        default=100,
        ge=1,
        le=1000,
        description="Maximum number of files to index from each folder (1-1000)",
    )
    # When True, only changes since the last index are synced; set it to
    # False to force a full re-index.
    incremental_sync: bool = Field(
        default=True,
        description="Only sync changes since last index (faster). Disable for a full re-index.",
    )
    # When True, files inside subfolders of the selected folders are indexed
    # as well, not just the files directly inside each selected folder.
    include_subfolders: bool = Field(
        default=True,
        description="Recursively index files in subfolders of selected folders",
    )
class GoogleDriveIndexRequest(BaseModel): class GoogleDriveIndexRequest(BaseModel):
"""Request body for indexing Google Drive content.""" """Request body for indexing Google Drive content."""
@ -19,6 +38,10 @@ class GoogleDriveIndexRequest(BaseModel):
files: list[DriveItem] = Field( files: list[DriveItem] = Field(
default_factory=list, description="List of specific files to index" default_factory=list, description="List of specific files to index"
) )
indexing_options: GoogleDriveIndexingOptions = Field(
default_factory=GoogleDriveIndexingOptions,
description="Indexing configuration options",
)
def has_items(self) -> bool: def has_items(self) -> bool:
"""Check if any items are selected.""" """Check if any items are selected."""

View file

@ -461,7 +461,7 @@ def index_google_drive_files_task(
connector_id: int, connector_id: int,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
items_dict: dict, # Dictionary with 'folders' and 'files' lists items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
): ):
"""Celery task to index Google Drive folders and files.""" """Celery task to index Google Drive folders and files."""
import asyncio import asyncio
@ -486,7 +486,7 @@ async def _index_google_drive_files(
connector_id: int, connector_id: int,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
items_dict: dict, # Dictionary with 'folders' and 'files' lists items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
): ):
"""Index Google Drive folders and files with new session.""" """Index Google Drive folders and files with new session."""
from app.routes.search_source_connectors_routes import ( from app.routes.search_source_connectors_routes import (

View file

@ -72,6 +72,7 @@ async def _check_and_trigger_schedules():
index_elasticsearch_documents_task, index_elasticsearch_documents_task,
index_github_repos_task, index_github_repos_task,
index_google_calendar_events_task, index_google_calendar_events_task,
index_google_drive_files_task,
index_google_gmail_messages_task, index_google_gmail_messages_task,
index_jira_issues_task, index_jira_issues_task,
index_linear_issues_task, index_linear_issues_task,
@ -96,6 +97,7 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task, SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
} }
# Trigger indexing for each due connector # Trigger indexing for each due connector
@ -106,13 +108,57 @@ async def _check_and_trigger_schedules():
f"Triggering periodic indexing for connector {connector.id} " f"Triggering periodic indexing for connector {connector.id} "
f"({connector.connector_type.value})" f"({connector.connector_type.value})"
) )
task.delay(
connector.id, # Special handling for Google Drive - uses config for folder/file selection
connector.search_space_id, if (
str(connector.user_id), connector.connector_type
None, # start_date - uses last_indexed_at == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
None, # end_date - uses now ):
) connector_config = connector.config or {}
selected_folders = connector_config.get("selected_folders", [])
selected_files = connector_config.get("selected_files", [])
indexing_options = connector_config.get(
"indexing_options",
{
"max_files_per_folder": 100,
"incremental_sync": True,
"include_subfolders": True,
},
)
if selected_folders or selected_files:
task.delay(
connector.id,
connector.search_space_id,
str(connector.user_id),
{
"folders": selected_folders,
"files": selected_files,
"indexing_options": indexing_options,
},
)
else:
# No folders/files selected - skip indexing but still update next_scheduled_at
# to prevent checking every minute
logger.info(
f"Google Drive connector {connector.id} has no folders or files selected, "
"skipping periodic indexing (will check again at next scheduled time)"
)
from datetime import timedelta
connector.next_scheduled_at = now + timedelta(
minutes=connector.indexing_frequency_minutes
)
await session.commit()
continue
else:
task.delay(
connector.id,
connector.search_space_id,
str(connector.user_id),
None, # start_date - uses last_indexed_at
None, # end_date - uses now
)
# Update next_scheduled_at for next run # Update next_scheduled_at for next run
from datetime import timedelta from datetime import timedelta

View file

@ -37,6 +37,7 @@ async def index_google_drive_files(
use_delta_sync: bool = True, use_delta_sync: bool = True,
update_last_indexed: bool = True, update_last_indexed: bool = True,
max_files: int = 500, max_files: int = 500,
include_subfolders: bool = False,
) -> tuple[int, str | None]: ) -> tuple[int, str | None]:
""" """
Index Google Drive files for a specific connector. Index Google Drive files for a specific connector.
@ -51,6 +52,7 @@ async def index_google_drive_files(
use_delta_sync: Whether to use change tracking for incremental sync use_delta_sync: Whether to use change tracking for incremental sync
update_last_indexed: Whether to update last_indexed_at timestamp update_last_indexed: Whether to update last_indexed_at timestamp
max_files: Maximum number of files to index max_files: Maximum number of files to index
include_subfolders: Whether to recursively index files in subfolders
Returns: Returns:
Tuple of (number_of_indexed_files, error_message) Tuple of (number_of_indexed_files, error_message)
@ -144,6 +146,7 @@ async def index_google_drive_files(
task_logger=task_logger, task_logger=task_logger,
log_entry=log_entry, log_entry=log_entry,
max_files=max_files, max_files=max_files,
include_subfolders=include_subfolders,
) )
else: else:
logger.info(f"Using full scan for connector {connector_id}") logger.info(f"Using full scan for connector {connector_id}")
@ -159,6 +162,7 @@ async def index_google_drive_files(
task_logger=task_logger, task_logger=task_logger,
log_entry=log_entry, log_entry=log_entry,
max_files=max_files, max_files=max_files,
include_subfolders=include_subfolders,
) )
documents_indexed, documents_skipped = result documents_indexed, documents_skipped = result
@ -168,6 +172,9 @@ async def index_google_drive_files(
if new_token and not token_error: if new_token and not token_error:
from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.orm.attributes import flag_modified
# Refresh connector to reload attributes that may have been expired by earlier commits
await session.refresh(connector)
if "folder_tokens" not in connector.config: if "folder_tokens" not in connector.config:
connector.config["folder_tokens"] = {} connector.config["folder_tokens"] = {}
connector.config["folder_tokens"][target_folder_id] = new_token connector.config["folder_tokens"][target_folder_id] = new_token
@ -375,60 +382,89 @@ async def _index_full_scan(
task_logger: TaskLoggingService, task_logger: TaskLoggingService,
log_entry: any, log_entry: any,
max_files: int, max_files: int,
include_subfolders: bool = False,
) -> tuple[int, int]: ) -> tuple[int, int]:
"""Perform full scan indexing of a folder.""" """Perform full scan indexing of a folder."""
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Starting full scan of folder: {folder_name}", f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
{"stage": "full_scan", "folder_id": folder_id}, {
"stage": "full_scan",
"folder_id": folder_id,
"include_subfolders": include_subfolders,
},
) )
documents_indexed = 0 documents_indexed = 0
documents_skipped = 0 documents_skipped = 0
page_token = None
files_processed = 0 files_processed = 0
while files_processed < max_files: # Queue of folders to process: (folder_id, folder_name)
files, next_token, error = await get_files_in_folder( folders_to_process = [(folder_id, folder_name)]
drive_client, folder_id, include_subfolders=False, page_token=page_token
)
if error: while folders_to_process and files_processed < max_files:
logger.error(f"Error listing files: {error}") current_folder_id, current_folder_name = folders_to_process.pop(0)
break logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
page_token = None
if not files: while files_processed < max_files:
break # Get files and folders in current folder
# include_subfolders=True here so we get folder items to queue them
for file in files: files, next_token, error = await get_files_in_folder(
if files_processed >= max_files: drive_client,
break current_folder_id,
include_subfolders=True,
files_processed += 1 page_token=page_token,
indexed, skipped = await _process_single_file(
drive_client=drive_client,
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
) )
documents_indexed += indexed if error:
documents_skipped += skipped logger.error(f"Error listing files in {current_folder_name}: {error}")
break
if documents_indexed % 10 == 0 and documents_indexed > 0: if not files:
await session.commit() break
logger.info(
f"Committed batch: {documents_indexed} files indexed so far" for file in files:
if files_processed >= max_files:
break
mime_type = file.get("mimeType", "")
# If this is a folder and include_subfolders is enabled, queue it for processing
if mime_type == "application/vnd.google-apps.folder":
if include_subfolders:
folders_to_process.append(
(file["id"], file.get("name", "Unknown"))
)
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
continue
# Process the file
files_processed += 1
indexed, skipped = await _process_single_file(
drive_client=drive_client,
session=session,
file=file,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
task_logger=task_logger,
log_entry=log_entry,
) )
page_token = next_token documents_indexed += indexed
if not page_token: documents_skipped += skipped
break
if documents_indexed % 10 == 0 and documents_indexed > 0:
await session.commit()
logger.info(
f"Committed batch: {documents_indexed} files indexed so far"
)
page_token = next_token
if not page_token:
break
logger.info( logger.info(
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped" f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
@ -448,8 +484,13 @@ async def _index_with_delta_sync(
task_logger: TaskLoggingService, task_logger: TaskLoggingService,
log_entry: any, log_entry: any,
max_files: int, max_files: int,
include_subfolders: bool = False,
) -> tuple[int, int]: ) -> tuple[int, int]:
"""Perform delta sync indexing using change tracking.""" """Perform delta sync indexing using change tracking.
Note: include_subfolders is accepted for API consistency but delta sync
automatically tracks changes across all folders including subfolders.
"""
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Starting delta sync from token: {start_page_token[:20]}...", f"Starting delta sync from token: {start_page_token[:20]}...",
@ -515,6 +556,131 @@ async def _index_with_delta_sync(
return documents_indexed, documents_skipped return documents_indexed, documents_skipped
async def _check_rename_only_update(
    session: AsyncSession,
    file: dict,
    search_space_id: int,
) -> tuple[bool, str | None]:
    """
    Check if a file only needs a rename update (no content change).

    Content changes are detected by comparing ``md5Checksum`` (preferred) or
    ``modifiedTime`` (fallback when the incoming metadata carries no checksum,
    e.g. Google Workspace files) against the values stored in the existing
    document's metadata. When the content is unchanged the expensive
    download + ETL step can be skipped; if only the name differs, the stored
    title and metadata are updated in place and committed.

    Args:
        session: Database session
        file: File metadata from Google Drive API
        search_space_id: ID of the search space

    Returns:
        Tuple of (is_rename_only, message)
        - (True, message): Content is unchanged. The message starts with
          "File renamed" when the stored name was updated, otherwise it
          reports that the file is unchanged.
        - (False, None): Content changed, the file is new, or there is not
          enough stored information to decide - needs full processing.
    """
    from sqlalchemy import select
    from sqlalchemy.orm.attributes import flag_modified

    from app.db import Document

    file_id = file.get("id")
    if not file_id:
        # Without an id there is nothing to look the document up by.
        return False, None

    file_name = file.get("name", "Unknown")
    incoming_md5 = file.get("md5Checksum")  # None for Google Workspace files
    incoming_modified_time = file.get("modifiedTime")

    # Try to find existing document by file_id-based hash (primary method)
    primary_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
    )
    existing_document = await check_document_by_unique_identifier(session, primary_hash)

    # If not found by primary hash, try searching by metadata (for legacy documents)
    if not existing_document:
        result = await session.execute(
            select(Document).where(
                Document.search_space_id == search_space_id,
                Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
                Document.document_metadata["google_drive_file_id"].astext == file_id,
            )
        )
        # Take the first match instead of scalar_one_or_none(): more than one
        # legacy row may carry the same Drive file id, and scalar_one_or_none()
        # would raise MultipleResultsFound in that case.
        existing_document = result.scalars().first()
        if existing_document:
            logger.debug(f"Found legacy document by metadata for file_id: {file_id}")

    if not existing_document:
        # New file, needs full processing
        return False, None

    # Get stored checksums/timestamps from document metadata
    doc_metadata = existing_document.document_metadata or {}
    stored_md5 = doc_metadata.get("md5_checksum")
    stored_modified_time = doc_metadata.get("modified_time")

    if incoming_md5:
        if not stored_md5:
            # Have incoming md5 but no stored md5 (legacy doc) - reprocess to store it
            logger.debug(
                f"No stored md5 for {file_name}, will reprocess to store md5_checksum"
            )
            return False, None
        # Best case: the checksum only changes when content changes, not on rename
        content_unchanged = incoming_md5 == stored_md5
        logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}")
    else:
        # No md5Checksum available - fall back to modifiedTime.
        # Note: modifiedTime is less reliable as it changes on rename too,
        # but it is the best signal available here.
        if not (incoming_modified_time and stored_modified_time):
            # Nothing to compare against (e.g. legacy doc) - reprocess to store it
            return False, None
        content_unchanged = incoming_modified_time == stored_modified_time
        logger.debug(
            f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}"
        )

    if not content_unchanged:
        # Content changed - needs full processing
        return False, None

    # Content hasn't changed - check if filename changed
    old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
        "google_drive_file_name"
    )
    if not old_name or old_name == file_name:
        # Neither content nor name changed (or no stored name to compare with)
        logger.debug(f"File unchanged: {file_name}")
        return True, "File unchanged (same content and name)"

    # Rename-only update - update the document without re-processing
    existing_document.title = file_name
    if not existing_document.document_metadata:
        existing_document.document_metadata = {}
    existing_document.document_metadata["FILE_NAME"] = file_name
    existing_document.document_metadata["google_drive_file_name"] = file_name
    # Keep the stored modified_time in sync so the next comparison uses the
    # timestamp that accompanies the new name.
    if incoming_modified_time:
        existing_document.document_metadata["modified_time"] = incoming_modified_time
    # In-place mutation of the metadata dict is flagged explicitly so the
    # change is included in the commit.
    flag_modified(existing_document, "document_metadata")
    await session.commit()

    logger.info(f"Rename-only update: '{old_name}' -> '{file_name}' (skipped ETL)")
    # NOTE: callers match on the word "renamed" in this message; keep it.
    return (
        True,
        f"File renamed: '{old_name}' -> '{file_name}' (no content change)",
    )
async def _process_single_file( async def _process_single_file(
drive_client: GoogleDriveClient, drive_client: GoogleDriveClient,
session: AsyncSession, session: AsyncSession,
@ -537,6 +703,27 @@ async def _process_single_file(
try: try:
logger.info(f"Processing file: {file_name} ({mime_type})") logger.info(f"Processing file: {file_name} ({mime_type})")
# Early check: Is this a rename-only update?
# This optimization prevents downloading and ETL processing for files
# where only the name changed but content is the same.
is_rename_only, rename_message = await _check_rename_only_update(
session=session,
file=file,
search_space_id=search_space_id,
)
if is_rename_only:
await task_logger.log_task_progress(
log_entry,
f"Skipped ETL for {file_name}: {rename_message}",
{"status": "rename_only", "reason": rename_message},
)
# Return 1 for renamed files (they are "indexed" in the sense that they're updated)
# Return 0 for unchanged files
if "renamed" in (rename_message or "").lower():
return 1, 0
return 0, 1
_, error, _ = await download_and_process_file( _, error, _ = await download_and_process_file(
client=drive_client, client=drive_client,
file=file, file=file,
@ -564,7 +751,15 @@ async def _process_single_file(
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
"""Remove a document that was deleted in Drive.""" """Remove a document that was deleted in Drive.
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
"""
from sqlalchemy import select
from app.db import Document
# First try with file_id-based hash (new method)
unique_identifier_hash = generate_unique_identifier_hash( unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
) )
@ -573,6 +768,19 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
session, unique_identifier_hash session, unique_identifier_hash
) )
# If not found, search by metadata (for legacy documents with filename-based hash)
if not existing_document:
result = await session.execute(
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
Document.document_metadata["google_drive_file_id"].astext == file_id,
)
)
existing_document = result.scalar_one_or_none()
if existing_document:
logger.info(f"Found legacy document by metadata for file_id: {file_id}")
if existing_document: if existing_document:
await session.delete(existing_document) await session.delete(existing_document)
logger.info(f"Removed deleted file document: {file_id}") logger.info(f"Removed deleted file document: {file_id}")

View file

@ -31,6 +31,7 @@ from app.utils.document_converters import (
from .base import ( from .base import (
check_document_by_unique_identifier, check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp, get_current_timestamp,
) )
from .markdown_processor import add_received_markdown_file_document from .markdown_processor import add_received_markdown_file_document
@ -49,6 +50,160 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
) )
def get_google_drive_unique_identifier(
    connector: dict | None,
    filename: str,
    search_space_id: int,
) -> tuple[str, str | None]:
    """
    Build the unique-identifier hash(es) used to look a file up in the database.

    Google Drive files are keyed by their Drive file id, which stays the same
    when the file is renamed. Everything else - including a Google Drive
    connector whose metadata carries no file id - is keyed by filename.

    Args:
        connector: Optional connector info dict with "type" and "metadata" keys
        filename: The filename (identifier for non-Drive files; also used to
            build the legacy hash for Drive files)
        search_space_id: The search space ID

    Returns:
        Tuple of (primary_hash, legacy_hash or None)
        - Google Drive with a file id: (file_id-based hash, filename-based
          hash kept so documents indexed under the old scheme can be found)
        - Anything else: (filename-based hash, None)
    """
    is_drive_file = (
        bool(connector) and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE
    )
    drive_file_id = (
        connector.get("metadata", {}).get("google_drive_file_id")
        if is_drive_file
        else None
    )

    if not drive_file_id:
        # Not a Drive file (or no file id available): filename is the identifier.
        filename_hash = generate_unique_identifier_hash(
            DocumentType.FILE, filename, search_space_id
        )
        return filename_hash, None

    # Current scheme: hash of the Drive file id.
    id_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, drive_file_id, search_space_id
    )
    # Previous scheme: hash of the filename, returned for backward compatibility.
    name_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
    )
    return id_based_hash, name_based_hash
async def handle_existing_document_update(
    session: AsyncSession,
    existing_document: Document,
    content_hash: str,
    connector: dict | None,
    filename: str,
    primary_hash: str,
) -> tuple[bool, Document | None]:
    """
    Handle update logic for an existing document.

    Re-points the document at ``primary_hash`` if it was stored under a
    different identifier, then decides whether the caller can skip
    re-processing. For Google Drive files whose content is unchanged but whose
    name differs from the stored one, the title and name metadata are updated
    and committed here.

    Args:
        session: Database session
        existing_document: The existing document found in database
        content_hash: Hash of the new content
        connector: Optional connector info
        filename: Current filename
        primary_hash: The primary hash (file_id based for Google Drive)

    Returns:
        Tuple of (should_skip_processing, document_to_return)
        - (True, document): Content unchanged, just return existing document
        - (False, None): Content changed, need to re-process
    """
    # Check if this document needs hash migration (found via a different hash)
    if existing_document.unique_identifier_hash != primary_hash:
        # NOTE(review): this also triggers when the document was matched by some
        # other key than the legacy filename hash (e.g. a content-hash match) -
        # confirm that re-pointing such a document is intended.
        # NOTE(review): the new hash is only committed here on the rename path
        # below; otherwise it presumably relies on a later commit by the caller.
        existing_document.unique_identifier_hash = primary_hash
        logging.info(f"Migrated document to file_id-based identifier: {filename}")

    if existing_document.content_hash != content_hash:
        # Content has changed - need to re-process
        logging.info(f"Content changed for file {filename}. Updating document.")
        return False, None

    # Content unchanged - check if we need to update metadata (e.g., filename changed)
    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
        connector_metadata = connector.get("metadata", {})
        new_name = connector_metadata.get("google_drive_file_name")
        # Check both possible keys for old name (FILE_NAME is used in stored documents)
        doc_metadata = existing_document.document_metadata or {}
        old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
            "google_drive_file_name"
        )

        if new_name and old_name and old_name != new_name:
            # File was renamed - update title and metadata, skip expensive processing
            from sqlalchemy.orm.attributes import flag_modified

            existing_document.title = new_name
            if not existing_document.document_metadata:
                existing_document.document_metadata = {}
            existing_document.document_metadata["FILE_NAME"] = new_name
            existing_document.document_metadata["google_drive_file_name"] = new_name
            # The metadata dict is mutated in place, so flag it explicitly to
            # make sure the change is included in the commit.
            flag_modified(existing_document, "document_metadata")
            await session.commit()
            logging.info(
                f"File renamed in Google Drive: '{old_name}' -> '{new_name}' (no re-processing needed)"
            )

    logging.info(f"Document for file {filename} unchanged. Skipping.")
    return True, existing_document
async def find_existing_document_with_migration(
    session: AsyncSession,
    primary_hash: str,
    legacy_hash: str | None,
    content_hash: str | None = None,
) -> Document | None:
    """
    Look up an already-stored document using up to three keys, in order.

    1. ``primary_hash`` - the current unique identifier.
    2. ``legacy_hash`` - the older filename-based identifier, if one is given.
    3. ``content_hash`` - a duplicate-content lookup, if one is given, so the
       same content stored under a different identifier is still found.

    The first lookup that yields a document wins; later lookups are skipped.

    Args:
        session: Database session
        primary_hash: The primary hash (file_id based for Google Drive)
        legacy_hash: The legacy hash (filename based) for migration, or None
        content_hash: The content hash for fallback deduplication, or None

    Returns:
        Existing document if found, None otherwise
    """
    # 1) Current identifier scheme.
    match = await check_document_by_unique_identifier(session, primary_hash)
    if match:
        return match

    # 2) Migration path: the document may still be stored under the old hash.
    if legacy_hash:
        match = await check_document_by_unique_identifier(session, legacy_hash)
        if match:
            logging.info(
                "Found legacy document (filename-based hash), will migrate to file_id-based hash"
            )
            return match

    # 3) Same content under a different unique identifier (e.g. manual upload
    #    vs Google Drive); finding it here avoids a unique-constraint violation.
    if content_hash:
        match = await check_duplicate_document(session, content_hash)
        if match:
            logging.info(
                f"Found duplicate content from different source (content_hash match). "
                f"Original document ID: {match.id}, type: {match.document_type}"
            )

    return match
async def parse_with_llamacloud_retry( async def parse_with_llamacloud_retry(
file_path: str, file_path: str,
estimated_pages: int, estimated_pages: int,
@ -158,6 +313,7 @@ async def add_received_file_document_using_unstructured(
unstructured_processed_elements: list[LangChainDocument], unstructured_processed_elements: list[LangChainDocument],
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
connector: dict | None = None,
) -> Document | None: ) -> Document | None:
""" """
Process and store a file document using Unstructured service. Process and store a file document using Unstructured service.
@ -168,6 +324,7 @@ async def add_received_file_document_using_unstructured(
unstructured_processed_elements: Processed elements from Unstructured unstructured_processed_elements: Processed elements from Unstructured
search_space_id: ID of the search space search_space_id: ID of the search space
user_id: ID of the user user_id: ID of the user
connector: Optional connector info for Google Drive files
Returns: Returns:
Document object if successful, None if failed Document object if successful, None if failed
@ -177,29 +334,32 @@ async def add_received_file_document_using_unstructured(
unstructured_processed_elements unstructured_processed_elements
) )
# Generate unique identifier hash for this file # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
unique_identifier_hash = generate_unique_identifier_hash( primary_hash, legacy_hash = get_google_drive_unique_identifier(
DocumentType.FILE, file_name, search_space_id connector, file_name, search_space_id
) )
# Generate content hash # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id) content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this unique identifier already exists # Check if document exists (with migration support for Google Drive and content_hash fallback)
existing_document = await check_document_by_unique_identifier( existing_document = await find_existing_document_with_migration(
session, unique_identifier_hash session, primary_hash, legacy_hash, content_hash
) )
if existing_document: if existing_document:
# Document exists - check if content has changed # Handle existing document (rename detection, content change check)
if existing_document.content_hash == content_hash: should_skip, doc = await handle_existing_document_update(
logging.info(f"Document for file {file_name} unchanged. Skipping.") session,
return existing_document existing_document,
else: content_hash,
# Content has changed - update the existing document connector,
logging.info( file_name,
f"Content changed for file {file_name}. Updating document." primary_hash,
) )
if should_skip:
return doc
# Content changed - continue to update
# Get user's long context LLM (needed for both create and update) # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id) user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@ -251,10 +411,15 @@ async def add_received_file_document_using_unstructured(
document = existing_document document = existing_document
else: else:
# Create new document # Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
document = Document( document = Document(
search_space_id=search_space_id, search_space_id=search_space_id,
title=file_name, title=file_name,
document_type=DocumentType.FILE, document_type=doc_type,
document_metadata={ document_metadata={
"FILE_NAME": file_name, "FILE_NAME": file_name,
"ETL_SERVICE": "UNSTRUCTURED", "ETL_SERVICE": "UNSTRUCTURED",
@ -263,7 +428,7 @@ async def add_received_file_document_using_unstructured(
embedding=summary_embedding, embedding=summary_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash, unique_identifier_hash=primary_hash,
blocknote_document=blocknote_json, blocknote_document=blocknote_json,
content_needs_reindexing=False, content_needs_reindexing=False,
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
@ -288,6 +453,7 @@ async def add_received_file_document_using_llamacloud(
llamacloud_markdown_document: str, llamacloud_markdown_document: str,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
connector: dict | None = None,
) -> Document | None: ) -> Document | None:
""" """
Process and store document content parsed by LlamaCloud. Process and store document content parsed by LlamaCloud.
@ -298,6 +464,7 @@ async def add_received_file_document_using_llamacloud(
llamacloud_markdown_document: Markdown content from LlamaCloud parsing llamacloud_markdown_document: Markdown content from LlamaCloud parsing
search_space_id: ID of the search space search_space_id: ID of the search space
user_id: ID of the user user_id: ID of the user
connector: Optional connector info for Google Drive files
Returns: Returns:
Document object if successful, None if failed Document object if successful, None if failed
@ -306,29 +473,32 @@ async def add_received_file_document_using_llamacloud(
# Combine all markdown documents into one # Combine all markdown documents into one
file_in_markdown = llamacloud_markdown_document file_in_markdown = llamacloud_markdown_document
# Generate unique identifier hash for this file # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
unique_identifier_hash = generate_unique_identifier_hash( primary_hash, legacy_hash = get_google_drive_unique_identifier(
DocumentType.FILE, file_name, search_space_id connector, file_name, search_space_id
) )
# Generate content hash # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id) content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this unique identifier already exists # Check if document exists (with migration support for Google Drive and content_hash fallback)
existing_document = await check_document_by_unique_identifier( existing_document = await find_existing_document_with_migration(
session, unique_identifier_hash session, primary_hash, legacy_hash, content_hash
) )
if existing_document: if existing_document:
# Document exists - check if content has changed # Handle existing document (rename detection, content change check)
if existing_document.content_hash == content_hash: should_skip, doc = await handle_existing_document_update(
logging.info(f"Document for file {file_name} unchanged. Skipping.") session,
return existing_document existing_document,
else: content_hash,
# Content has changed - update the existing document connector,
logging.info( file_name,
f"Content changed for file {file_name}. Updating document." primary_hash,
) )
if should_skip:
return doc
# Content changed - continue to update
# Get user's long context LLM (needed for both create and update) # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id) user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@ -380,10 +550,15 @@ async def add_received_file_document_using_llamacloud(
document = existing_document document = existing_document
else: else:
# Create new document # Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
document = Document( document = Document(
search_space_id=search_space_id, search_space_id=search_space_id,
title=file_name, title=file_name,
document_type=DocumentType.FILE, document_type=doc_type,
document_metadata={ document_metadata={
"FILE_NAME": file_name, "FILE_NAME": file_name,
"ETL_SERVICE": "LLAMACLOUD", "ETL_SERVICE": "LLAMACLOUD",
@ -392,7 +567,7 @@ async def add_received_file_document_using_llamacloud(
embedding=summary_embedding, embedding=summary_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash, unique_identifier_hash=primary_hash,
blocknote_document=blocknote_json, blocknote_document=blocknote_json,
content_needs_reindexing=False, content_needs_reindexing=False,
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
@ -419,6 +594,7 @@ async def add_received_file_document_using_docling(
docling_markdown_document: str, docling_markdown_document: str,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
connector: dict | None = None,
) -> Document | None: ) -> Document | None:
""" """
Process and store document content parsed by Docling. Process and store document content parsed by Docling.
@ -429,6 +605,7 @@ async def add_received_file_document_using_docling(
docling_markdown_document: Markdown content from Docling parsing docling_markdown_document: Markdown content from Docling parsing
search_space_id: ID of the search space search_space_id: ID of the search space
user_id: ID of the user user_id: ID of the user
connector: Optional connector info for Google Drive files
Returns: Returns:
Document object if successful, None if failed Document object if successful, None if failed
@ -436,35 +613,38 @@ async def add_received_file_document_using_docling(
try: try:
file_in_markdown = docling_markdown_document file_in_markdown = docling_markdown_document
# Generate unique identifier hash for this file # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
unique_identifier_hash = generate_unique_identifier_hash( primary_hash, legacy_hash = get_google_drive_unique_identifier(
DocumentType.FILE, file_name, search_space_id connector, file_name, search_space_id
) )
# Generate content hash # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id) content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this unique identifier already exists # Check if document exists (with migration support for Google Drive and content_hash fallback)
existing_document = await check_document_by_unique_identifier( existing_document = await find_existing_document_with_migration(
session, unique_identifier_hash session, primary_hash, legacy_hash, content_hash
) )
if existing_document: if existing_document:
# Document exists - check if content has changed # Handle existing document (rename detection, content change check)
if existing_document.content_hash == content_hash: should_skip, doc = await handle_existing_document_update(
logging.info(f"Document for file {file_name} unchanged. Skipping.") session,
return existing_document existing_document,
else: content_hash,
# Content has changed - update the existing document connector,
logging.info( file_name,
f"Content changed for file {file_name}. Updating document." primary_hash,
) )
if should_skip:
return doc
# Content changed - continue to update
# Get user's long context LLM (needed for both create and update) # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id) user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm: if not user_llm:
raise RuntimeError( raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}" f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
) )
# Generate summary using chunked processing for large documents # Generate summary using chunked processing for large documents
@ -534,10 +714,15 @@ async def add_received_file_document_using_docling(
document = existing_document document = existing_document
else: else:
# Create new document # Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
document = Document( document = Document(
search_space_id=search_space_id, search_space_id=search_space_id,
title=file_name, title=file_name,
document_type=DocumentType.FILE, document_type=doc_type,
document_metadata={ document_metadata={
"FILE_NAME": file_name, "FILE_NAME": file_name,
"ETL_SERVICE": "DOCLING", "ETL_SERVICE": "DOCLING",
@ -546,15 +731,15 @@ async def add_received_file_document_using_docling(
embedding=summary_embedding, embedding=summary_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash, unique_identifier_hash=primary_hash,
blocknote_document=blocknote_json, blocknote_document=blocknote_json,
content_needs_reindexing=False, content_needs_reindexing=False,
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
) )
session.add(document) session.add(document)
await session.commit() await session.commit()
await session.refresh(document) await session.refresh(document)
return document return document
except SQLAlchemyError as db_error: except SQLAlchemyError as db_error:
@ -650,7 +835,7 @@ async def process_file_in_background(
# Process markdown directly through specialized function # Process markdown directly through specialized function
result = await add_received_markdown_file_document( result = await add_received_markdown_file_document(
session, filename, markdown_content, search_space_id, user_id session, filename, markdown_content, search_space_id, user_id, connector
) )
if connector: if connector:
@ -790,7 +975,7 @@ async def process_file_in_background(
# Process transcription as markdown document # Process transcription as markdown document
result = await add_received_markdown_file_document( result = await add_received_markdown_file_document(
session, filename, transcribed_text, search_space_id, user_id session, filename, transcribed_text, search_space_id, user_id, connector
) )
if connector: if connector:
@ -955,7 +1140,7 @@ async def process_file_in_background(
# Pass the documents to the existing background task # Pass the documents to the existing background task
result = await add_received_file_document_using_unstructured( result = await add_received_file_document_using_unstructured(
session, filename, docs, search_space_id, user_id session, filename, docs, search_space_id, user_id, connector
) )
if connector: if connector:
@ -1103,6 +1288,7 @@ async def process_file_in_background(
llamacloud_markdown_document=markdown_content, llamacloud_markdown_document=markdown_content,
search_space_id=search_space_id, search_space_id=search_space_id,
user_id=user_id, user_id=user_id,
connector=connector,
) )
# Track if this document was successfully created # Track if this document was successfully created
@ -1256,6 +1442,7 @@ async def process_file_in_background(
docling_markdown_document=result["content"], docling_markdown_document=result["content"],
search_space_id=search_space_id, search_space_id=search_space_id,
user_id=user_id, user_id=user_id,
connector=connector,
) )
if doc_result: if doc_result:

View file

@ -19,16 +19,157 @@ from app.utils.document_converters import (
from .base import ( from .base import (
check_document_by_unique_identifier, check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp, get_current_timestamp,
) )
def _get_google_drive_unique_identifier(
    connector: dict | None,
    filename: str,
    search_space_id: int,
) -> tuple[str, str | None]:
    """
    Build the identifier hash(es) used to look up a stored document for a file.

    A Google Drive connector that carries a ``google_drive_file_id`` in its
    metadata yields two hashes: one derived from the file id and one derived
    from the filename. Every other input (no connector, a different connector
    type, or a Drive connector without a file id) yields a single
    filename-derived hash of type ``DocumentType.FILE``.

    Args:
        connector: Optional connector info dict; its ``type`` and
            ``metadata`` entries are read
        filename: Name of the file, hashed for the legacy/fallback identifier
        search_space_id: The search space ID mixed into every hash

    Returns:
        Tuple of (primary_hash, legacy_hash). legacy_hash is None unless the
        file-id-based Google Drive hash was produced.
    """
    is_drive_file = (
        bool(connector) and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE
    )
    drive_file_id = (
        connector.get("metadata", {}).get("google_drive_file_id")
        if is_drive_file
        else None
    )

    # Anything without a usable Drive file id is identified by filename only.
    if not drive_file_id:
        filename_hash = generate_unique_identifier_hash(
            DocumentType.FILE, filename, search_space_id
        )
        return filename_hash, None

    # Drive file: the id-based hash is primary; the filename-based hash is
    # returned alongside it so callers can still find older records.
    id_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, drive_file_id, search_space_id
    )
    name_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
    )
    return id_based_hash, name_based_hash
async def _find_existing_document_with_migration(
    session: AsyncSession,
    primary_hash: str,
    legacy_hash: str | None,
    content_hash: str | None = None,
) -> Document | None:
    """
    Look up an already-stored document using up to three keys, in order.

    The lookups are tried one after another and the first hit wins:
      1. ``primary_hash`` via ``check_document_by_unique_identifier``
      2. ``legacy_hash`` (when given) via ``check_document_by_unique_identifier``
      3. ``content_hash`` (when given) via ``check_duplicate_document``

    Args:
        session: Database session passed through to the lookup helpers
        primary_hash: The current unique identifier hash
        legacy_hash: The older filename-based hash, or None to skip step 2
        content_hash: The content hash, or None to skip step 3

    Returns:
        The first document found, otherwise the (empty) result of the last
        lookup that was attempted.
    """
    # Step 1: current identifier.
    found = await check_document_by_unique_identifier(session, primary_hash)
    if found:
        return found

    # Step 2: legacy identifier, so records stored under the old scheme are reused.
    if legacy_hash:
        found = await check_document_by_unique_identifier(session, legacy_hash)
        if found:
            logging.info(
                "Found legacy document (filename-based hash), will migrate to file_id-based hash"
            )
            return found

    # Step 3: same content stored under a different unique identifier.
    if content_hash:
        found = await check_duplicate_document(session, content_hash)
        if found:
            logging.info(
                f"Found duplicate content from different source (content_hash match). "
                f"Original document ID: {found.id}, type: {found.document_type}"
            )

    return found
async def _handle_existing_document_update(
    session: AsyncSession,
    existing_document: Document,
    content_hash: str,
    connector: dict | None,
    filename: str,
    primary_hash: str,
    task_logger: TaskLoggingService,
    log_entry,
) -> tuple[bool, Document | None]:
    """
    Decide whether an already-stored document needs to be re-processed.

    The document's ``unique_identifier_hash`` is first rewritten to
    ``primary_hash`` when the two differ. Then:

    * If the stored ``content_hash`` differs from ``content_hash`` the caller
      must re-process the file, and ``(False, None)`` is returned.
    * If the content is unchanged, a Google Drive rename (connector name differs
      from the stored name) updates the title and the name metadata keys in
      place, any pending change is committed, task success is logged, and
      ``(True, existing_document)`` is returned.

    Args:
        session: Database session the document is attached to
        existing_document: The stored document that matched the incoming file
        content_hash: Content hash of the incoming file
        connector: Optional connector info dict (``type`` and ``metadata`` are read)
        filename: Name of the incoming file, used in log messages
        primary_hash: The identifier hash the document should be stored under
        task_logger: Task logger used to record the "unchanged" outcome
        log_entry: Log entry handed back to ``task_logger.log_task_success``

    Returns:
        Tuple of (should_skip_processing, document_to_return)
    """
    # Move the document onto the current identifier scheme if it was found
    # through a different key (legacy hash or content hash).
    hash_migrated = existing_document.unique_identifier_hash != primary_hash
    if hash_migrated:
        existing_document.unique_identifier_hash = primary_hash
        logging.info(f"Migrated document to file_id-based identifier: {filename}")

    # Content changed: the caller continues with the full update path, which
    # is responsible for saving the document (including the new identifier).
    if existing_document.content_hash != content_hash:
        logging.info(
            f"Content changed for markdown file {filename}. Updating document."
        )
        return False, None

    # Content unchanged - check if we need to update metadata (e.g., filename changed)
    committed = False
    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
        connector_metadata = connector.get("metadata", {})
        new_name = connector_metadata.get("google_drive_file_name")
        # Check every key the old name may have been stored under
        # (FILE_NAME is the one written when documents are created).
        doc_metadata = existing_document.document_metadata or {}
        old_name = (
            doc_metadata.get("FILE_NAME")
            or doc_metadata.get("google_drive_file_name")
            or doc_metadata.get("file_name")
        )

        if new_name and old_name and old_name != new_name:
            # File was renamed - update title and metadata, skip expensive processing
            from sqlalchemy.orm.attributes import flag_modified

            existing_document.title = new_name
            if not existing_document.document_metadata:
                existing_document.document_metadata = {}
            existing_document.document_metadata["FILE_NAME"] = new_name
            existing_document.document_metadata["file_name"] = new_name
            existing_document.document_metadata["google_drive_file_name"] = new_name
            # The metadata dict is mutated in place, so the attribute has to be
            # flagged explicitly for the change to be written out.
            flag_modified(existing_document, "document_metadata")
            await session.commit()
            committed = True
            logging.info(
                f"File renamed in Google Drive: '{old_name}' -> '{new_name}' (no re-processing needed)"
            )

    # Persist the identifier migration explicitly. The caller returns right
    # after a skip, so without this commit the new hash would only be saved if
    # some unrelated later code happened to commit the same session.
    if hash_migrated and not committed:
        await session.commit()

    await task_logger.log_task_success(
        log_entry,
        f"Markdown file document unchanged: {filename}",
        {
            "duplicate_detected": True,
            "existing_document_id": existing_document.id,
        },
    )
    logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
    return True, existing_document
async def add_received_markdown_file_document( async def add_received_markdown_file_document(
session: AsyncSession, session: AsyncSession,
file_name: str, file_name: str,
file_in_markdown: str, file_in_markdown: str,
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
connector: dict | None = None,
) -> Document | None: ) -> Document | None:
""" """
Process and store a markdown file document. Process and store a markdown file document.
@ -39,6 +180,7 @@ async def add_received_markdown_file_document(
file_in_markdown: Content of the markdown file file_in_markdown: Content of the markdown file
search_space_id: ID of the search space search_space_id: ID of the search space
user_id: ID of the user user_id: ID of the user
connector: Optional connector info for Google Drive files
Returns: Returns:
Document object if successful, None if failed Document object if successful, None if failed
@ -58,39 +200,34 @@ async def add_received_markdown_file_document(
) )
try: try:
# Generate unique identifier hash for this markdown file # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
unique_identifier_hash = generate_unique_identifier_hash( primary_hash, legacy_hash = _get_google_drive_unique_identifier(
DocumentType.FILE, file_name, search_space_id connector, file_name, search_space_id
) )
# Generate content hash # Generate content hash
content_hash = generate_content_hash(file_in_markdown, search_space_id) content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document with this unique identifier already exists # Check if document exists (with migration support for Google Drive and content_hash fallback)
existing_document = await check_document_by_unique_identifier( existing_document = await _find_existing_document_with_migration(
session, unique_identifier_hash session, primary_hash, legacy_hash, content_hash
) )
if existing_document: if existing_document:
# Document exists - check if content has changed # Handle existing document (rename detection, content change check)
if existing_document.content_hash == content_hash: should_skip, doc = await _handle_existing_document_update(
await task_logger.log_task_success( session,
log_entry, existing_document,
f"Markdown file document unchanged: {file_name}", content_hash,
{ connector,
"duplicate_detected": True, file_name,
"existing_document_id": existing_document.id, primary_hash,
}, task_logger,
) log_entry,
logging.info( )
f"Document for markdown file {file_name} unchanged. Skipping." if should_skip:
) return doc
return existing_document # Content changed - continue to update
else:
# Content has changed - update the existing document
logging.info(
f"Content changed for markdown file {file_name}. Updating document."
)
# Get user's long context LLM (needed for both create and update) # Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id) user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@ -139,10 +276,15 @@ async def add_received_markdown_file_document(
document = existing_document document = existing_document
else: else:
# Create new document # Create new document
# Determine document type based on connector
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
document = Document( document = Document(
search_space_id=search_space_id, search_space_id=search_space_id,
title=file_name, title=file_name,
document_type=DocumentType.FILE, document_type=doc_type,
document_metadata={ document_metadata={
"FILE_NAME": file_name, "FILE_NAME": file_name,
}, },
@ -150,7 +292,7 @@ async def add_received_markdown_file_document(
embedding=summary_embedding, embedding=summary_embedding,
chunks=chunks, chunks=chunks,
content_hash=content_hash, content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash, unique_identifier_hash=primary_hash,
blocknote_document=blocknote_json, blocknote_document=blocknote_json,
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
) )

View file

@ -1,6 +1,7 @@
"use client"; "use client";
import type { FC } from "react"; import type { FC } from "react";
import { AlertCircle } from "lucide-react";
import { Label } from "@/components/ui/label"; import { Label } from "@/components/ui/label";
import { import {
Select, Select,
@ -16,6 +17,8 @@ interface PeriodicSyncConfigProps {
frequencyMinutes: string; frequencyMinutes: string;
onEnabledChange: (enabled: boolean) => void; onEnabledChange: (enabled: boolean) => void;
onFrequencyChange: (frequency: string) => void; onFrequencyChange: (frequency: string) => void;
disabled?: boolean;
disabledMessage?: string;
} }
export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
@ -23,6 +26,8 @@ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
frequencyMinutes, frequencyMinutes,
onEnabledChange, onEnabledChange,
onFrequencyChange, onFrequencyChange,
disabled = false,
disabledMessage,
}) => { }) => {
return ( return (
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6"> <div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
@ -33,9 +38,17 @@ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
Automatically re-index at regular intervals Automatically re-index at regular intervals
</p> </p>
</div> </div>
<Switch checked={enabled} onCheckedChange={onEnabledChange} /> <Switch checked={enabled} onCheckedChange={onEnabledChange} disabled={disabled} />
</div> </div>
{/* Show disabled message when periodic sync can't be enabled */}
{disabled && disabledMessage && (
<div className="mt-3 flex items-start gap-2 text-amber-600 dark:text-amber-400">
<AlertCircle className="size-4 mt-0.5 shrink-0" />
<p className="text-xs sm:text-sm">{disabledMessage}</p>
</div>
)}
{enabled && ( {enabled && (
<div className="mt-4 pt-4 border-t border-slate-400/20 space-y-3"> <div className="mt-4 pt-4 border-t border-slate-400/20 space-y-3">
<div className="space-y-2"> <div className="space-y-2">

View file

@ -1,11 +1,19 @@
"use client"; "use client";
import { Info } from "lucide-react"; import { File, FileText, FileSpreadsheet, FolderClosed, Image, Presentation } from "lucide-react";
import type { FC } from "react"; import type { FC } from "react";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree"; import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
import { Alert, AlertDescription } from "@/components/ui/alert";
import { Button } from "@/components/ui/button"; import { Button } from "@/components/ui/button";
import { Label } from "@/components/ui/label";
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from "@/components/ui/select";
import { Switch } from "@/components/ui/switch";
import type { ConnectorConfigProps } from "../index"; import type { ConnectorConfigProps } from "../index";
interface SelectedFolder { interface SelectedFolder {
@ -13,128 +21,292 @@ interface SelectedFolder {
name: string; name: string;
} }
// Shape of the `indexing_options` entry stored in the connector config.
// NOTE(review): the field meanings below are read off the names; confirm
// against the backend indexer that consumes this config.
interface IndexingOptions {
	// Presumably an upper bound on how many files are indexed per folder.
	max_files_per_folder: number;
	// Presumably toggles incremental (changes-only) syncing.
	incremental_sync: boolean;
	// Presumably toggles descending into subfolders of a selected folder.
	include_subfolders: boolean;
}
// Fallback indexing options, used when the connector config does not carry
// an `indexing_options` entry yet.
const DEFAULT_INDEXING_OPTIONS: IndexingOptions = {
	max_files_per_folder: 100,
	incremental_sync: true,
	include_subfolders: true,
};
// Helper to get the appropriate icon for a file based on its name.
//
// A recognised file extension always decides the icon. Keyword hints in the
// name ("spreadsheet", "presentation", "document", "word", "text") are only
// consulted when no recognised extension is present, so names such as
// "password.png" or "context.pdf" are not mistaken for text documents.
function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") {
	const lowerName = fileName.toLowerCase();
	const hasExtension = (...extensions: string[]) =>
		extensions.some((extension) => lowerName.endsWith(extension));
	const hasKeyword = (...keywords: string[]) =>
		keywords.some((keyword) => lowerName.includes(keyword));

	// --- Pass 1: recognised extensions ---
	// Spreadsheets
	if (hasExtension(".xlsx", ".xls", ".csv")) {
		return <FileSpreadsheet className={`${className} text-green-500`} />;
	}
	// Presentations
	if (hasExtension(".pptx", ".ppt")) {
		return <Presentation className={`${className} text-orange-500`} />;
	}
	// Documents (word, text only - not PDF)
	if (hasExtension(".docx", ".doc", ".txt")) {
		return <FileText className={`${className} text-gray-500`} />;
	}
	// Images
	if (hasExtension(".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg")) {
		return <Image className={`${className} text-purple-500`} />;
	}
	// PDFs use the generic icon and must not fall through to the keyword pass
	if (hasExtension(".pdf")) {
		return <File className={`${className} text-gray-500`} />;
	}

	// --- Pass 2: keyword hints for names without a recognised extension ---
	if (hasKeyword("spreadsheet")) {
		return <FileSpreadsheet className={`${className} text-green-500`} />;
	}
	if (hasKeyword("presentation")) {
		return <Presentation className={`${className} text-orange-500`} />;
	}
	if (hasKeyword("document", "word", "text")) {
		return <FileText className={`${className} text-gray-500`} />;
	}

	// Default
	return <File className={`${className} text-gray-500`} />;
}
export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => { export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => {
// Initialize with existing selected folders and files from connector config // Initialize with existing selected folders and files from connector config
const existingFolders = const existingFolders =
(connector.config?.selected_folders as SelectedFolder[] | undefined) || []; (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
const existingIndexingOptions =
(connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
const [selectedFolders, setSelectedFolders] = useState<SelectedFolder[]>(existingFolders); const [selectedFolders, setSelectedFolders] = useState<SelectedFolder[]>(existingFolders);
const [selectedFiles, setSelectedFiles] = useState<SelectedFolder[]>(existingFiles); const [selectedFiles, setSelectedFiles] = useState<SelectedFolder[]>(existingFiles);
const [showFolderSelector, setShowFolderSelector] = useState(false); const [showFolderSelector, setShowFolderSelector] = useState(false);
const [indexingOptions, setIndexingOptions] = useState<IndexingOptions>(existingIndexingOptions);
// Update selected folders and files when connector config changes // Update selected folders and files when connector config changes
useEffect(() => { useEffect(() => {
const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || []; const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || []; const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
const options =
(connector.config?.indexing_options as IndexingOptions | undefined) ||
DEFAULT_INDEXING_OPTIONS;
setSelectedFolders(folders); setSelectedFolders(folders);
setSelectedFiles(files); setSelectedFiles(files);
setIndexingOptions(options);
}, [connector.config]); }, [connector.config]);
const handleSelectFolders = (folders: SelectedFolder[]) => { const updateConfig = (
setSelectedFolders(folders); folders: SelectedFolder[],
files: SelectedFolder[],
options: IndexingOptions
) => {
if (onConfigChange) { if (onConfigChange) {
// Store folder IDs and names in config for indexing
onConfigChange({ onConfigChange({
...connector.config, ...connector.config,
selected_folders: folders, selected_folders: folders,
selected_files: selectedFiles, // Preserve existing files selected_files: files,
indexing_options: options,
}); });
} }
}; };
const handleSelectFolders = (folders: SelectedFolder[]) => {
setSelectedFolders(folders);
updateConfig(folders, selectedFiles, indexingOptions);
};
const handleSelectFiles = (files: SelectedFolder[]) => { const handleSelectFiles = (files: SelectedFolder[]) => {
setSelectedFiles(files); setSelectedFiles(files);
if (onConfigChange) { updateConfig(selectedFolders, files, indexingOptions);
// Store file IDs and names in config for indexing };
onConfigChange({
...connector.config, const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => {
selected_folders: selectedFolders, // Preserve existing folders const newOptions = { ...indexingOptions, [key]: value };
selected_files: files, setIndexingOptions(newOptions);
}); updateConfig(selectedFolders, selectedFiles, newOptions);
}
}; };
const totalSelected = selectedFolders.length + selectedFiles.length; const totalSelected = selectedFolders.length + selectedFiles.length;
return ( return (
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4"> <div className="space-y-4">
<div className="space-y-1 sm:space-y-2"> {/* Folder & File Selection */}
<h3 className="font-medium text-sm sm:text-base">Folder & File Selection</h3> <div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
<p className="text-xs sm:text-sm text-muted-foreground"> <div className="space-y-1 sm:space-y-2">
Select specific folders and/or individual files to index. Only files directly in each <h3 className="font-medium text-sm sm:text-base">Folder & File Selection</h3>
folder will be processed—subfolders must be selected separately. <p className="text-xs sm:text-sm text-muted-foreground">
</p> Select specific folders and/or individual files to index.
</div>
{totalSelected > 0 && (
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
<p className="font-medium">
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:
{selectedFolders.length > 0 &&
` ${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`}
{selectedFiles.length > 0 &&
` ${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`}
</p> </p>
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
{selectedFolders.map((folder) => (
<p
key={folder.id}
className="text-xs sm:text-sm text-muted-foreground truncate"
title={folder.name}
>
📁 {folder.name}
</p>
))}
{selectedFiles.map((file) => (
<p
key={file.id}
className="text-xs sm:text-sm text-muted-foreground truncate"
title={file.name}
>
📄 {file.name}
</p>
))}
</div>
</div> </div>
)}
{showFolderSelector ? ( {totalSelected > 0 && (
<div className="space-y-2 sm:space-y-3"> <div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
<GoogleDriveFolderTree <p className="font-medium">
connectorId={connector.id} Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => {
selectedFolders={selectedFolders} const parts: string[] = [];
onSelectFolders={handleSelectFolders} if (selectedFolders.length > 0) {
selectedFiles={selectedFiles} parts.push(
onSelectFiles={handleSelectFiles} `${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`
/> );
}
if (selectedFiles.length > 0) {
parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
}
return parts.length > 0 ? `(${parts.join(" ")})` : "";
})()}
</p>
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
{selectedFolders.map((folder) => (
<p
key={folder.id}
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
title={folder.name}
>
<FolderClosed className="size-3.5 shrink-0 text-gray-500" />
{folder.name}
</p>
))}
{selectedFiles.map((file) => (
<p
key={file.id}
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
title={file.name}
>
{getFileIconFromName(file.name)}
{file.name}
</p>
))}
</div>
</div>
)}
{showFolderSelector ? (
<div className="space-y-2 sm:space-y-3">
<GoogleDriveFolderTree
connectorId={connector.id}
selectedFolders={selectedFolders}
onSelectFolders={handleSelectFolders}
selectedFiles={selectedFiles}
onSelectFiles={handleSelectFiles}
/>
<Button
type="button"
variant="outline"
size="sm"
onClick={() => setShowFolderSelector(false)}
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
>
Done Selecting
</Button>
</div>
) : (
<Button <Button
type="button" type="button"
variant="outline" variant="outline"
size="sm" onClick={() => setShowFolderSelector(true)}
onClick={() => setShowFolderSelector(false)}
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9" className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
> >
Done Selecting {totalSelected > 0 ? "Change Selection" : "Select Folders & Files"}
</Button> </Button>
</div> )}
) : ( </div>
<Button
type="button"
variant="outline"
onClick={() => setShowFolderSelector(true)}
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
>
{totalSelected > 0 ? "Change Selection" : "Select Folders & Files"}
</Button>
)}
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center gap-2 [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0"> {/* Indexing Options */}
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0" /> <div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-4">
<AlertDescription className="text-[10px] sm:text-xs !pl-0"> <div className="space-y-1 sm:space-y-2">
Folder and file selection is used when indexing. You can change this selection when you <h3 className="font-medium text-sm sm:text-base">Indexing Options</h3>
start indexing. <p className="text-xs sm:text-sm text-muted-foreground">
</AlertDescription> Configure how files are indexed from your Google Drive.
</Alert> </p>
</div>
{/* Max files per folder */}
<div className="space-y-2">
<div className="flex items-center justify-between">
<div className="space-y-0.5">
<Label htmlFor="max-files" className="text-sm font-medium">
Max files per folder
</Label>
<p className="text-xs text-muted-foreground">
Maximum number of files to index from each folder
</p>
</div>
<Select
value={indexingOptions.max_files_per_folder.toString()}
onValueChange={(value) =>
handleIndexingOptionChange("max_files_per_folder", parseInt(value, 10))
}
>
<SelectTrigger
id="max-files"
className="w-[140px] bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 text-xs sm:text-sm"
>
<SelectValue placeholder="Select limit" />
</SelectTrigger>
<SelectContent className="z-[100]">
<SelectItem value="50" className="text-xs sm:text-sm">
50 files
</SelectItem>
<SelectItem value="100" className="text-xs sm:text-sm">
100 files
</SelectItem>
<SelectItem value="250" className="text-xs sm:text-sm">
250 files
</SelectItem>
<SelectItem value="500" className="text-xs sm:text-sm">
500 files
</SelectItem>
<SelectItem value="1000" className="text-xs sm:text-sm">
1000 files
</SelectItem>
</SelectContent>
</Select>
</div>
</div>
{/* Incremental sync toggle */}
<div className="flex items-center justify-between pt-2 border-t border-slate-400/20">
<div className="space-y-0.5">
<Label htmlFor="incremental-sync" className="text-sm font-medium">
Incremental sync
</Label>
<p className="text-xs text-muted-foreground">
Only sync changes since last index (faster). Disable for a full re-index.
</p>
</div>
<Switch
id="incremental-sync"
checked={indexingOptions.incremental_sync}
onCheckedChange={(checked) => handleIndexingOptionChange("incremental_sync", checked)}
/>
</div>
{/* Include subfolders toggle */}
<div className="flex items-center justify-between pt-2 border-t border-slate-400/20">
<div className="space-y-0.5">
<Label htmlFor="include-subfolders" className="text-sm font-medium">
Include subfolders
</Label>
<p className="text-xs text-muted-foreground">
Recursively index files in subfolders of selected folders
</p>
</div>
<Switch
id="include-subfolders"
checked={indexingOptions.include_subfolders}
onCheckedChange={(checked) => handleIndexingOptionChange("include_subfolders", checked)}
/>
</div>
</div>
</div> </div>
); );
}; };

View file

@ -222,15 +222,36 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
/> />
)} )}
{/* Periodic sync - not shown for Google Drive */} {/* Periodic sync - shown for all indexable connectors */}
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && ( {(() => {
<PeriodicSyncConfig // Check if Google Drive has folders/files selected
enabled={periodicEnabled} const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
frequencyMinutes={frequencyMinutes} const selectedFolders =
onEnabledChange={onPeriodicEnabledChange} (connector.config?.selected_folders as
onFrequencyChange={onFrequencyChange} | Array<{ id: string; name: string }>
/> | undefined) || [];
)} const selectedFiles =
(connector.config?.selected_files as
| Array<{ id: string; name: string }>
| undefined) || [];
const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
const isDisabled = isGoogleDrive && !hasItemsSelected;
return (
<PeriodicSyncConfig
enabled={periodicEnabled}
frequencyMinutes={frequencyMinutes}
onEnabledChange={onPeriodicEnabledChange}
onFrequencyChange={onFrequencyChange}
disabled={isDisabled}
disabledMessage={
isDisabled
? "Select at least one folder or file above to enable periodic sync"
: undefined
}
/>
);
})()}
</> </>
)} )}

View file

@ -219,11 +219,9 @@ export const useConnectorDialog = () => {
setEditingConnector(connector); setEditingConnector(connector);
setConnectorConfig(connector.config); setConnectorConfig(connector.config);
setConnectorName(connector.name); setConnectorName(connector.name);
// Load existing periodic sync settings (disabled for Google Drive and non-indexable connectors) // Load existing periodic sync settings (disabled for non-indexable connectors)
setPeriodicEnabled( setPeriodicEnabled(
connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || !connector.is_indexable !connector.is_indexable ? false : connector.periodic_indexing_enabled
? false
: connector.periodic_indexing_enabled
); );
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
// Reset dates - user can set new ones for re-indexing // Reset dates - user can set new ones for re-indexing
@ -882,20 +880,14 @@ export const useConnectorDialog = () => {
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
// Update connector with periodic sync settings and config changes // Update connector with periodic sync settings and config changes
// Note: Periodic sync is disabled for Google Drive connectors
if (periodicEnabled || indexingConnectorConfig) { if (periodicEnabled || indexingConnectorConfig) {
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined; const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
await updateConnector({ await updateConnector({
id: indexingConfig.connectorId, id: indexingConfig.connectorId,
data: { data: {
...(periodicEnabled && ...(periodicEnabled && {
indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && { periodic_indexing_enabled: true,
periodic_indexing_enabled: true, indexing_frequency_minutes: frequency,
indexing_frequency_minutes: frequency,
}),
...(indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" && {
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,
}), }),
...(indexingConnectorConfig && { ...(indexingConnectorConfig && {
config: indexingConnectorConfig, config: indexingConnectorConfig,
@ -912,11 +904,18 @@ export const useConnectorDialog = () => {
const selectedFiles = indexingConnectorConfig.selected_files as const selectedFiles = indexingConnectorConfig.selected_files as
| Array<{ id: string; name: string }> | Array<{ id: string; name: string }>
| undefined; | undefined;
const indexingOptions = indexingConnectorConfig.indexing_options as
| {
max_files_per_folder: number;
incremental_sync: boolean;
include_subfolders: boolean;
}
| undefined;
if ( if (
(selectedFolders && selectedFolders.length > 0) || (selectedFolders && selectedFolders.length > 0) ||
(selectedFiles && selectedFiles.length > 0) (selectedFiles && selectedFiles.length > 0)
) { ) {
// Index with folder/file selection // Index with folder/file selection and indexing options
await indexConnector({ await indexConnector({
connector_id: indexingConfig.connectorId, connector_id: indexingConfig.connectorId,
queryParams: { queryParams: {
@ -925,6 +924,11 @@ export const useConnectorDialog = () => {
body: { body: {
folders: selectedFolders || [], folders: selectedFolders || [],
files: selectedFiles || [], files: selectedFiles || [],
indexing_options: indexingOptions || {
max_files_per_folder: 100,
incremental_sync: true,
include_subfolders: true,
},
}, },
}); });
} else { } else {
@ -964,7 +968,7 @@ export const useConnectorDialog = () => {
); );
// Track periodic indexing started if enabled // Track periodic indexing started if enabled
if (periodicEnabled && indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR") { if (periodicEnabled) {
trackPeriodicIndexingStarted( trackPeriodicIndexingStarted(
Number(searchSpaceId), Number(searchSpaceId),
indexingConfig.connectorType, indexingConfig.connectorType,
@ -1072,12 +1076,8 @@ export const useConnectorDialog = () => {
setEditingConnector(connector); setEditingConnector(connector);
setConnectorName(connector.name); setConnectorName(connector.name);
// Load existing periodic sync settings (disabled for Google Drive and non-indexable connectors) // Load existing periodic sync settings (disabled for non-indexable connectors)
setPeriodicEnabled( setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || !connector.is_indexable
? false
: connector.periodic_indexing_enabled
);
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440"); setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
// Reset dates - user can set new ones for re-indexing // Reset dates - user can set new ones for re-indexing
setStartDate(undefined); setStartDate(undefined);
@ -1117,6 +1117,24 @@ export const useConnectorDialog = () => {
return; return;
} }
// Prevent periodic indexing for Google Drive without folders/files selected
if (periodicEnabled && editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") {
const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as
| Array<{ id: string; name: string }>
| undefined;
const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as
| Array<{ id: string; name: string }>
| undefined;
const hasItemsSelected =
(selectedFolders && selectedFolders.length > 0) ||
(selectedFiles && selectedFiles.length > 0);
if (!hasItemsSelected) {
toast.error("Select at least one folder or file to enable periodic sync");
return;
}
}
// Validate frequency minutes if periodic is enabled (only for indexable connectors) // Validate frequency minutes if periodic is enabled (only for indexable connectors)
if (periodicEnabled && editingConnector.is_indexable) { if (periodicEnabled && editingConnector.is_indexable) {
const frequencyValidation = frequencyMinutesSchema.safeParse(frequencyMinutes); const frequencyValidation = frequencyMinutesSchema.safeParse(frequencyMinutes);
@ -1132,23 +1150,14 @@ export const useConnectorDialog = () => {
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined; const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
// Update connector with periodic sync settings, config changes, and name // Update connector with periodic sync settings, config changes, and name
// Note: Periodic sync is disabled for Google Drive connectors and non-indexable connectors
const frequency = const frequency =
periodicEnabled && editingConnector.is_indexable ? parseInt(frequencyMinutes, 10) : null; periodicEnabled && editingConnector.is_indexable ? parseInt(frequencyMinutes, 10) : null;
await updateConnector({ await updateConnector({
id: editingConnector.id, id: editingConnector.id,
data: { data: {
name: connectorName || editingConnector.name, name: connectorName || editingConnector.name,
periodic_indexing_enabled: periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
!editingConnector.is_indexable
? false
: periodicEnabled,
indexing_frequency_minutes:
editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
!editingConnector.is_indexable
? null
: frequency,
config: connectorConfig || editingConnector.config, config: connectorConfig || editingConnector.config,
}, },
}); });
@ -1166,6 +1175,13 @@ export const useConnectorDialog = () => {
const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as
| Array<{ id: string; name: string }> | Array<{ id: string; name: string }>
| undefined; | undefined;
const indexingOptions = (connectorConfig || editingConnector.config)?.indexing_options as
| {
max_files_per_folder: number;
incremental_sync: boolean;
include_subfolders: boolean;
}
| undefined;
if ( if (
(selectedFolders && selectedFolders.length > 0) || (selectedFolders && selectedFolders.length > 0) ||
(selectedFiles && selectedFiles.length > 0) (selectedFiles && selectedFiles.length > 0)
@ -1178,6 +1194,11 @@ export const useConnectorDialog = () => {
body: { body: {
folders: selectedFolders || [], folders: selectedFolders || [],
files: selectedFiles || [], files: selectedFiles || [],
indexing_options: indexingOptions || {
max_files_per_folder: 100,
incremental_sync: true,
include_subfolders: true,
},
}, },
}); });
const totalItems = (selectedFolders?.length || 0) + (selectedFiles?.length || 0); const totalItems = (selectedFolders?.length || 0) + (selectedFiles?.length || 0);
@ -1221,12 +1242,8 @@ export const useConnectorDialog = () => {
); );
} }
// Track periodic indexing if enabled (for non-Google Drive connectors) // Track periodic indexing if enabled
if ( if (periodicEnabled && editingConnector.is_indexable) {
periodicEnabled &&
editingConnector.is_indexable &&
editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR"
) {
trackPeriodicIndexingStarted( trackPeriodicIndexingStarted(
Number(searchSpaceId), Number(searchSpaceId),
editingConnector.connector_type, editingConnector.connector_type,

View file

@ -5,13 +5,13 @@ import {
ChevronRight, ChevronRight,
File, File,
FileText, FileText,
Folder, FolderClosed,
FolderOpen, FolderOpen,
HardDrive, HardDrive,
Image, Image,
Loader2, Loader2,
Presentation, Presentation,
Sheet, FileSpreadsheet,
} from "lucide-react"; } from "lucide-react";
import { useState } from "react"; import { useState } from "react";
import { Checkbox } from "@/components/ui/checkbox"; import { Checkbox } from "@/components/ui/checkbox";
@ -53,16 +53,16 @@ interface GoogleDriveFolderTreeProps {
// Helper to get appropriate icon for file type // Helper to get appropriate icon for file type
function getFileIcon(mimeType: string, className: string = "h-4 w-4") { function getFileIcon(mimeType: string, className: string = "h-4 w-4") {
if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) { if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) {
return <Sheet className={`${className} text-green-600`} />; return <FileSpreadsheet className={`${className} text-green-500`} />;
} }
if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) { if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) {
return <Presentation className={`${className} text-orange-600`} />; return <Presentation className={`${className} text-orange-500`} />;
} }
if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) { if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) {
return <FileText className={`${className} text-blue-600`} />; return <FileText className={`${className} text-gray-500`} />;
} }
if (mimeType.includes("image")) { if (mimeType.includes("image")) {
return <Image className={`${className} text-purple-600`} />; return <Image className={`${className} text-purple-500`} />;
} }
return <File className={`${className} text-gray-500`} />; return <File className={`${className} text-gray-500`} />;
} }
@ -280,9 +280,9 @@ export function GoogleDriveFolderTree({
<div className="shrink-0"> <div className="shrink-0">
{isFolder ? ( {isFolder ? (
isExpanded ? ( isExpanded ? (
<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-blue-500" /> <FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
) : ( ) : (
<Folder className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" /> <FolderClosed className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
) )
) : ( ) : (
getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4") getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")