mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
Merge pull request #706 from AnishSarkar22/fix/drive-index
feat: enhance Google Drive indexing
This commit is contained in:
commit
87a174a1fd
17 changed files with 1116 additions and 279 deletions
|
|
@ -58,7 +58,7 @@ async def get_changes(
|
||||||
params = {
|
params = {
|
||||||
"pageToken": page_token,
|
"pageToken": page_token,
|
||||||
"pageSize": 100,
|
"pageSize": 100,
|
||||||
"fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, size, webViewLink, parents, trashed))",
|
"fields": "nextPageToken, newStartPageToken, changes(fileId, removed, file(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, trashed))",
|
||||||
"supportsAllDrives": True,
|
"supportsAllDrives": True,
|
||||||
"includeItemsFromAllDrives": True,
|
"includeItemsFromAllDrives": True,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,7 @@ class GoogleDriveClient:
|
||||||
async def list_files(
|
async def list_files(
|
||||||
self,
|
self,
|
||||||
query: str = "",
|
query: str = "",
|
||||||
fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, size, webViewLink, parents, owners, createdTime, description)",
|
fields: str = "nextPageToken, files(id, name, mimeType, modifiedTime, md5Checksum, size, webViewLink, parents, owners, createdTime, description)",
|
||||||
page_size: int = 100,
|
page_size: int = 100,
|
||||||
page_token: str | None = None,
|
page_token: str | None = None,
|
||||||
) -> tuple[list[dict[str, Any]], str | None, str | None]:
|
) -> tuple[list[dict[str, Any]], str | None, str | None]:
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,8 @@ async def download_and_process_file(
|
||||||
connector_info["metadata"]["file_size"] = file["size"]
|
connector_info["metadata"]["file_size"] = file["size"]
|
||||||
if "webViewLink" in file:
|
if "webViewLink" in file:
|
||||||
connector_info["metadata"]["web_view_link"] = file["webViewLink"]
|
connector_info["metadata"]["web_view_link"] = file["webViewLink"]
|
||||||
|
if "md5Checksum" in file:
|
||||||
|
connector_info["metadata"]["md5_checksum"] = file["md5Checksum"]
|
||||||
|
|
||||||
if is_google_workspace_file(mime_type):
|
if is_google_workspace_file(mime_type):
|
||||||
connector_info["metadata"]["exported_as"] = "pdf"
|
connector_info["metadata"]["exported_as"] = "pdf"
|
||||||
|
|
|
||||||
|
|
@ -157,7 +157,7 @@ async def get_file_by_id(
|
||||||
try:
|
try:
|
||||||
file, error = await client.get_file_metadata(
|
file, error = await client.get_file_metadata(
|
||||||
file_id,
|
file_id,
|
||||||
fields="id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink",
|
fields="id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink",
|
||||||
)
|
)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
|
|
@ -228,7 +228,7 @@ async def list_folder_contents(
|
||||||
while True:
|
while True:
|
||||||
items, next_token, error = await client.list_files(
|
items, next_token, error = await client.list_files(
|
||||||
query=query,
|
query=query,
|
||||||
fields="files(id, name, mimeType, parents, createdTime, modifiedTime, size, webViewLink, iconLink)",
|
fields="files(id, name, mimeType, parents, createdTime, modifiedTime, md5Checksum, size, webViewLink, iconLink)",
|
||||||
page_size=1000, # Max allowed by Google Drive API
|
page_size=1000, # Max allowed by Google Drive API
|
||||||
page_token=page_token,
|
page_token=page_token,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1716,7 +1716,7 @@ async def run_google_drive_indexing(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
items_dict: dict, # Dictionary with 'folders' and 'files' lists
|
items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
|
||||||
):
|
):
|
||||||
"""Runs the Google Drive indexing task for folders and files with notifications."""
|
"""Runs the Google Drive indexing task for folders and files with notifications."""
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
@ -1730,6 +1730,7 @@ async def run_google_drive_indexing(
|
||||||
|
|
||||||
# Parse the structured data
|
# Parse the structured data
|
||||||
items = GoogleDriveIndexRequest(**items_dict)
|
items = GoogleDriveIndexRequest(**items_dict)
|
||||||
|
indexing_options = items.indexing_options
|
||||||
total_indexed = 0
|
total_indexed = 0
|
||||||
errors = []
|
errors = []
|
||||||
|
|
||||||
|
|
@ -1765,7 +1766,7 @@ async def run_google_drive_indexing(
|
||||||
stage="fetching",
|
stage="fetching",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Index each folder
|
# Index each folder with indexing options
|
||||||
for folder in items.folders:
|
for folder in items.folders:
|
||||||
try:
|
try:
|
||||||
indexed_count, error_message = await index_google_drive_files(
|
indexed_count, error_message = await index_google_drive_files(
|
||||||
|
|
@ -1775,8 +1776,10 @@ async def run_google_drive_indexing(
|
||||||
user_id,
|
user_id,
|
||||||
folder_id=folder.id,
|
folder_id=folder.id,
|
||||||
folder_name=folder.name,
|
folder_name=folder.name,
|
||||||
use_delta_sync=True,
|
use_delta_sync=indexing_options.incremental_sync,
|
||||||
update_last_indexed=False,
|
update_last_indexed=False,
|
||||||
|
max_files=indexing_options.max_files_per_folder,
|
||||||
|
include_subfolders=indexing_options.include_subfolders,
|
||||||
)
|
)
|
||||||
if error_message:
|
if error_message:
|
||||||
errors.append(f"Folder '{folder.name}': {error_message}")
|
errors.append(f"Folder '{folder.name}': {error_message}")
|
||||||
|
|
@ -1837,6 +1840,8 @@ async def run_google_drive_indexing(
|
||||||
|
|
||||||
# Update notification on completion
|
# Update notification on completion
|
||||||
if notification:
|
if notification:
|
||||||
|
# Refresh notification to reload attributes that may have been expired by earlier commits
|
||||||
|
await session.refresh(notification)
|
||||||
await NotificationService.connector_indexing.notify_indexing_completed(
|
await NotificationService.connector_indexing.notify_indexing_completed(
|
||||||
session=session,
|
session=session,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from .documents import (
|
||||||
ExtensionDocumentMetadata,
|
ExtensionDocumentMetadata,
|
||||||
PaginatedResponse,
|
PaginatedResponse,
|
||||||
)
|
)
|
||||||
from .google_drive import DriveItem, GoogleDriveIndexRequest
|
from .google_drive import DriveItem, GoogleDriveIndexingOptions, GoogleDriveIndexRequest
|
||||||
from .logs import LogBase, LogCreate, LogFilter, LogRead, LogUpdate
|
from .logs import LogBase, LogCreate, LogFilter, LogRead, LogUpdate
|
||||||
from .new_chat import (
|
from .new_chat import (
|
||||||
ChatMessage,
|
ChatMessage,
|
||||||
|
|
@ -94,6 +94,7 @@ __all__ = [
|
||||||
"ExtensionDocumentMetadata",
|
"ExtensionDocumentMetadata",
|
||||||
"GlobalNewLLMConfigRead",
|
"GlobalNewLLMConfigRead",
|
||||||
"GoogleDriveIndexRequest",
|
"GoogleDriveIndexRequest",
|
||||||
|
"GoogleDriveIndexingOptions",
|
||||||
# Base schemas
|
# Base schemas
|
||||||
"IDModel",
|
"IDModel",
|
||||||
# RBAC schemas
|
# RBAC schemas
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,25 @@ class DriveItem(BaseModel):
|
||||||
name: str = Field(..., description="Item display name")
|
name: str = Field(..., description="Item display name")
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleDriveIndexingOptions(BaseModel):
|
||||||
|
"""Indexing options for Google Drive connector."""
|
||||||
|
|
||||||
|
max_files_per_folder: int = Field(
|
||||||
|
default=100,
|
||||||
|
ge=1,
|
||||||
|
le=1000,
|
||||||
|
description="Maximum number of files to index from each folder (1-1000)",
|
||||||
|
)
|
||||||
|
incremental_sync: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Only sync changes since last index (faster). Disable for a full re-index.",
|
||||||
|
)
|
||||||
|
include_subfolders: bool = Field(
|
||||||
|
default=True,
|
||||||
|
description="Recursively index files in subfolders of selected folders",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class GoogleDriveIndexRequest(BaseModel):
|
class GoogleDriveIndexRequest(BaseModel):
|
||||||
"""Request body for indexing Google Drive content."""
|
"""Request body for indexing Google Drive content."""
|
||||||
|
|
||||||
|
|
@ -19,6 +38,10 @@ class GoogleDriveIndexRequest(BaseModel):
|
||||||
files: list[DriveItem] = Field(
|
files: list[DriveItem] = Field(
|
||||||
default_factory=list, description="List of specific files to index"
|
default_factory=list, description="List of specific files to index"
|
||||||
)
|
)
|
||||||
|
indexing_options: GoogleDriveIndexingOptions = Field(
|
||||||
|
default_factory=GoogleDriveIndexingOptions,
|
||||||
|
description="Indexing configuration options",
|
||||||
|
)
|
||||||
|
|
||||||
def has_items(self) -> bool:
|
def has_items(self) -> bool:
|
||||||
"""Check if any items are selected."""
|
"""Check if any items are selected."""
|
||||||
|
|
|
||||||
|
|
@ -461,7 +461,7 @@ def index_google_drive_files_task(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
items_dict: dict, # Dictionary with 'folders' and 'files' lists
|
items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
|
||||||
):
|
):
|
||||||
"""Celery task to index Google Drive folders and files."""
|
"""Celery task to index Google Drive folders and files."""
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
@ -486,7 +486,7 @@ async def _index_google_drive_files(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
items_dict: dict, # Dictionary with 'folders' and 'files' lists
|
items_dict: dict, # Dictionary with 'folders', 'files', and 'indexing_options'
|
||||||
):
|
):
|
||||||
"""Index Google Drive folders and files with new session."""
|
"""Index Google Drive folders and files with new session."""
|
||||||
from app.routes.search_source_connectors_routes import (
|
from app.routes.search_source_connectors_routes import (
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,7 @@ async def _check_and_trigger_schedules():
|
||||||
index_elasticsearch_documents_task,
|
index_elasticsearch_documents_task,
|
||||||
index_github_repos_task,
|
index_github_repos_task,
|
||||||
index_google_calendar_events_task,
|
index_google_calendar_events_task,
|
||||||
|
index_google_drive_files_task,
|
||||||
index_google_gmail_messages_task,
|
index_google_gmail_messages_task,
|
||||||
index_jira_issues_task,
|
index_jira_issues_task,
|
||||||
index_linear_issues_task,
|
index_linear_issues_task,
|
||||||
|
|
@ -96,6 +97,7 @@ async def _check_and_trigger_schedules():
|
||||||
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
||||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||||
|
SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Trigger indexing for each due connector
|
# Trigger indexing for each due connector
|
||||||
|
|
@ -106,13 +108,57 @@ async def _check_and_trigger_schedules():
|
||||||
f"Triggering periodic indexing for connector {connector.id} "
|
f"Triggering periodic indexing for connector {connector.id} "
|
||||||
f"({connector.connector_type.value})"
|
f"({connector.connector_type.value})"
|
||||||
)
|
)
|
||||||
task.delay(
|
|
||||||
connector.id,
|
# Special handling for Google Drive - uses config for folder/file selection
|
||||||
connector.search_space_id,
|
if (
|
||||||
str(connector.user_id),
|
connector.connector_type
|
||||||
None, # start_date - uses last_indexed_at
|
== SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
|
||||||
None, # end_date - uses now
|
):
|
||||||
)
|
connector_config = connector.config or {}
|
||||||
|
selected_folders = connector_config.get("selected_folders", [])
|
||||||
|
selected_files = connector_config.get("selected_files", [])
|
||||||
|
indexing_options = connector_config.get(
|
||||||
|
"indexing_options",
|
||||||
|
{
|
||||||
|
"max_files_per_folder": 100,
|
||||||
|
"incremental_sync": True,
|
||||||
|
"include_subfolders": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if selected_folders or selected_files:
|
||||||
|
task.delay(
|
||||||
|
connector.id,
|
||||||
|
connector.search_space_id,
|
||||||
|
str(connector.user_id),
|
||||||
|
{
|
||||||
|
"folders": selected_folders,
|
||||||
|
"files": selected_files,
|
||||||
|
"indexing_options": indexing_options,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# No folders/files selected - skip indexing but still update next_scheduled_at
|
||||||
|
# to prevent checking every minute
|
||||||
|
logger.info(
|
||||||
|
f"Google Drive connector {connector.id} has no folders or files selected, "
|
||||||
|
"skipping periodic indexing (will check again at next scheduled time)"
|
||||||
|
)
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
connector.next_scheduled_at = now + timedelta(
|
||||||
|
minutes=connector.indexing_frequency_minutes
|
||||||
|
)
|
||||||
|
await session.commit()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
task.delay(
|
||||||
|
connector.id,
|
||||||
|
connector.search_space_id,
|
||||||
|
str(connector.user_id),
|
||||||
|
None, # start_date - uses last_indexed_at
|
||||||
|
None, # end_date - uses now
|
||||||
|
)
|
||||||
|
|
||||||
# Update next_scheduled_at for next run
|
# Update next_scheduled_at for next run
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,7 @@ async def index_google_drive_files(
|
||||||
use_delta_sync: bool = True,
|
use_delta_sync: bool = True,
|
||||||
update_last_indexed: bool = True,
|
update_last_indexed: bool = True,
|
||||||
max_files: int = 500,
|
max_files: int = 500,
|
||||||
|
include_subfolders: bool = False,
|
||||||
) -> tuple[int, str | None]:
|
) -> tuple[int, str | None]:
|
||||||
"""
|
"""
|
||||||
Index Google Drive files for a specific connector.
|
Index Google Drive files for a specific connector.
|
||||||
|
|
@ -51,6 +52,7 @@ async def index_google_drive_files(
|
||||||
use_delta_sync: Whether to use change tracking for incremental sync
|
use_delta_sync: Whether to use change tracking for incremental sync
|
||||||
update_last_indexed: Whether to update last_indexed_at timestamp
|
update_last_indexed: Whether to update last_indexed_at timestamp
|
||||||
max_files: Maximum number of files to index
|
max_files: Maximum number of files to index
|
||||||
|
include_subfolders: Whether to recursively index files in subfolders
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (number_of_indexed_files, error_message)
|
Tuple of (number_of_indexed_files, error_message)
|
||||||
|
|
@ -144,6 +146,7 @@ async def index_google_drive_files(
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
max_files=max_files,
|
max_files=max_files,
|
||||||
|
include_subfolders=include_subfolders,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.info(f"Using full scan for connector {connector_id}")
|
logger.info(f"Using full scan for connector {connector_id}")
|
||||||
|
|
@ -159,6 +162,7 @@ async def index_google_drive_files(
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
max_files=max_files,
|
max_files=max_files,
|
||||||
|
include_subfolders=include_subfolders,
|
||||||
)
|
)
|
||||||
|
|
||||||
documents_indexed, documents_skipped = result
|
documents_indexed, documents_skipped = result
|
||||||
|
|
@ -168,6 +172,9 @@ async def index_google_drive_files(
|
||||||
if new_token and not token_error:
|
if new_token and not token_error:
|
||||||
from sqlalchemy.orm.attributes import flag_modified
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
|
|
||||||
|
# Refresh connector to reload attributes that may have been expired by earlier commits
|
||||||
|
await session.refresh(connector)
|
||||||
|
|
||||||
if "folder_tokens" not in connector.config:
|
if "folder_tokens" not in connector.config:
|
||||||
connector.config["folder_tokens"] = {}
|
connector.config["folder_tokens"] = {}
|
||||||
connector.config["folder_tokens"][target_folder_id] = new_token
|
connector.config["folder_tokens"][target_folder_id] = new_token
|
||||||
|
|
@ -375,60 +382,89 @@ async def _index_full_scan(
|
||||||
task_logger: TaskLoggingService,
|
task_logger: TaskLoggingService,
|
||||||
log_entry: any,
|
log_entry: any,
|
||||||
max_files: int,
|
max_files: int,
|
||||||
|
include_subfolders: bool = False,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Perform full scan indexing of a folder."""
|
"""Perform full scan indexing of a folder."""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting full scan of folder: {folder_name}",
|
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
|
||||||
{"stage": "full_scan", "folder_id": folder_id},
|
{
|
||||||
|
"stage": "full_scan",
|
||||||
|
"folder_id": folder_id,
|
||||||
|
"include_subfolders": include_subfolders,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
documents_indexed = 0
|
documents_indexed = 0
|
||||||
documents_skipped = 0
|
documents_skipped = 0
|
||||||
page_token = None
|
|
||||||
files_processed = 0
|
files_processed = 0
|
||||||
|
|
||||||
while files_processed < max_files:
|
# Queue of folders to process: (folder_id, folder_name)
|
||||||
files, next_token, error = await get_files_in_folder(
|
folders_to_process = [(folder_id, folder_name)]
|
||||||
drive_client, folder_id, include_subfolders=False, page_token=page_token
|
|
||||||
)
|
|
||||||
|
|
||||||
if error:
|
while folders_to_process and files_processed < max_files:
|
||||||
logger.error(f"Error listing files: {error}")
|
current_folder_id, current_folder_name = folders_to_process.pop(0)
|
||||||
break
|
logger.info(f"Processing folder: {current_folder_name} ({current_folder_id})")
|
||||||
|
page_token = None
|
||||||
|
|
||||||
if not files:
|
while files_processed < max_files:
|
||||||
break
|
# Get files and folders in current folder
|
||||||
|
# include_subfolders=True here so we get folder items to queue them
|
||||||
for file in files:
|
files, next_token, error = await get_files_in_folder(
|
||||||
if files_processed >= max_files:
|
drive_client,
|
||||||
break
|
current_folder_id,
|
||||||
|
include_subfolders=True,
|
||||||
files_processed += 1
|
page_token=page_token,
|
||||||
|
|
||||||
indexed, skipped = await _process_single_file(
|
|
||||||
drive_client=drive_client,
|
|
||||||
session=session,
|
|
||||||
file=file,
|
|
||||||
connector_id=connector_id,
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
user_id=user_id,
|
|
||||||
task_logger=task_logger,
|
|
||||||
log_entry=log_entry,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
documents_indexed += indexed
|
if error:
|
||||||
documents_skipped += skipped
|
logger.error(f"Error listing files in {current_folder_name}: {error}")
|
||||||
|
break
|
||||||
|
|
||||||
if documents_indexed % 10 == 0 and documents_indexed > 0:
|
if not files:
|
||||||
await session.commit()
|
break
|
||||||
logger.info(
|
|
||||||
f"Committed batch: {documents_indexed} files indexed so far"
|
for file in files:
|
||||||
|
if files_processed >= max_files:
|
||||||
|
break
|
||||||
|
|
||||||
|
mime_type = file.get("mimeType", "")
|
||||||
|
|
||||||
|
# If this is a folder and include_subfolders is enabled, queue it for processing
|
||||||
|
if mime_type == "application/vnd.google-apps.folder":
|
||||||
|
if include_subfolders:
|
||||||
|
folders_to_process.append(
|
||||||
|
(file["id"], file.get("name", "Unknown"))
|
||||||
|
)
|
||||||
|
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Process the file
|
||||||
|
files_processed += 1
|
||||||
|
|
||||||
|
indexed, skipped = await _process_single_file(
|
||||||
|
drive_client=drive_client,
|
||||||
|
session=session,
|
||||||
|
file=file,
|
||||||
|
connector_id=connector_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
user_id=user_id,
|
||||||
|
task_logger=task_logger,
|
||||||
|
log_entry=log_entry,
|
||||||
)
|
)
|
||||||
|
|
||||||
page_token = next_token
|
documents_indexed += indexed
|
||||||
if not page_token:
|
documents_skipped += skipped
|
||||||
break
|
|
||||||
|
if documents_indexed % 10 == 0 and documents_indexed > 0:
|
||||||
|
await session.commit()
|
||||||
|
logger.info(
|
||||||
|
f"Committed batch: {documents_indexed} files indexed so far"
|
||||||
|
)
|
||||||
|
|
||||||
|
page_token = next_token
|
||||||
|
if not page_token:
|
||||||
|
break
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped"
|
||||||
|
|
@ -448,8 +484,13 @@ async def _index_with_delta_sync(
|
||||||
task_logger: TaskLoggingService,
|
task_logger: TaskLoggingService,
|
||||||
log_entry: any,
|
log_entry: any,
|
||||||
max_files: int,
|
max_files: int,
|
||||||
|
include_subfolders: bool = False,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Perform delta sync indexing using change tracking."""
|
"""Perform delta sync indexing using change tracking.
|
||||||
|
|
||||||
|
Note: include_subfolders is accepted for API consistency but delta sync
|
||||||
|
automatically tracks changes across all folders including subfolders.
|
||||||
|
"""
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Starting delta sync from token: {start_page_token[:20]}...",
|
f"Starting delta sync from token: {start_page_token[:20]}...",
|
||||||
|
|
@ -515,6 +556,131 @@ async def _index_with_delta_sync(
|
||||||
return documents_indexed, documents_skipped
|
return documents_indexed, documents_skipped
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_rename_only_update(
|
||||||
|
session: AsyncSession,
|
||||||
|
file: dict,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> tuple[bool, str | None]:
|
||||||
|
"""
|
||||||
|
Check if a file only needs a rename update (no content change).
|
||||||
|
|
||||||
|
Uses md5Checksum comparison (preferred) or modifiedTime (fallback for Google Workspace files)
|
||||||
|
to detect if content has changed. This optimization prevents unnecessary ETL API calls
|
||||||
|
(Docling/LlamaCloud) for rename-only operations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
file: File metadata from Google Drive API
|
||||||
|
search_space_id: ID of the search space
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (is_rename_only, message)
|
||||||
|
- (True, message): Only filename changed, document was updated
|
||||||
|
- (False, None): Content changed or new file, needs full processing
|
||||||
|
"""
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
|
|
||||||
|
from app.db import Document
|
||||||
|
|
||||||
|
file_id = file.get("id")
|
||||||
|
file_name = file.get("name", "Unknown")
|
||||||
|
incoming_md5 = file.get("md5Checksum") # None for Google Workspace files
|
||||||
|
incoming_modified_time = file.get("modifiedTime")
|
||||||
|
|
||||||
|
if not file_id:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
# Try to find existing document by file_id-based hash (primary method)
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
|
)
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||||
|
|
||||||
|
# If not found by primary hash, try searching by metadata (for legacy documents)
|
||||||
|
if not existing_document:
|
||||||
|
result = await session.execute(
|
||||||
|
select(Document).where(
|
||||||
|
Document.search_space_id == search_space_id,
|
||||||
|
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||||
|
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
existing_document = result.scalar_one_or_none()
|
||||||
|
if existing_document:
|
||||||
|
logger.debug(f"Found legacy document by metadata for file_id: {file_id}")
|
||||||
|
|
||||||
|
if not existing_document:
|
||||||
|
# New file, needs full processing
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
# Get stored checksums/timestamps from document metadata
|
||||||
|
doc_metadata = existing_document.document_metadata or {}
|
||||||
|
stored_md5 = doc_metadata.get("md5_checksum")
|
||||||
|
stored_modified_time = doc_metadata.get("modified_time")
|
||||||
|
|
||||||
|
# Determine if content changed using md5Checksum (preferred) or modifiedTime (fallback)
|
||||||
|
content_unchanged = False
|
||||||
|
|
||||||
|
if incoming_md5 and stored_md5:
|
||||||
|
# Best case: Compare md5 checksums (only changes when content changes, not on rename)
|
||||||
|
content_unchanged = incoming_md5 == stored_md5
|
||||||
|
logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}")
|
||||||
|
elif incoming_md5 and not stored_md5:
|
||||||
|
# Have incoming md5 but no stored md5 (legacy doc) - need to reprocess to store it
|
||||||
|
logger.debug(
|
||||||
|
f"No stored md5 for {file_name}, will reprocess to store md5_checksum"
|
||||||
|
)
|
||||||
|
return False, None
|
||||||
|
elif not incoming_md5:
|
||||||
|
# Google Workspace file (no md5Checksum available) - fall back to modifiedTime
|
||||||
|
# Note: modifiedTime is less reliable as it changes on rename too, but it's the best we have
|
||||||
|
if incoming_modified_time and stored_modified_time:
|
||||||
|
content_unchanged = incoming_modified_time == stored_modified_time
|
||||||
|
logger.debug(
|
||||||
|
f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# No stored modifiedTime (legacy) - reprocess to store it
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
if content_unchanged:
|
||||||
|
# Content hasn't changed - check if filename changed
|
||||||
|
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
|
||||||
|
"google_drive_file_name"
|
||||||
|
)
|
||||||
|
|
||||||
|
if old_name and old_name != file_name:
|
||||||
|
# Rename-only update - update the document without re-processing
|
||||||
|
existing_document.title = file_name
|
||||||
|
if not existing_document.document_metadata:
|
||||||
|
existing_document.document_metadata = {}
|
||||||
|
existing_document.document_metadata["FILE_NAME"] = file_name
|
||||||
|
existing_document.document_metadata["google_drive_file_name"] = file_name
|
||||||
|
# Also update modified_time for Google Workspace files (since it changed on rename)
|
||||||
|
if incoming_modified_time:
|
||||||
|
existing_document.document_metadata["modified_time"] = (
|
||||||
|
incoming_modified_time
|
||||||
|
)
|
||||||
|
flag_modified(existing_document, "document_metadata")
|
||||||
|
await session.commit()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
True,
|
||||||
|
f"File renamed: '{old_name}' → '{file_name}' (no content change)",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Neither content nor name changed
|
||||||
|
logger.debug(f"File unchanged: {file_name}")
|
||||||
|
return True, "File unchanged (same content and name)"
|
||||||
|
|
||||||
|
# Content changed - needs full processing
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
async def _process_single_file(
|
async def _process_single_file(
|
||||||
drive_client: GoogleDriveClient,
|
drive_client: GoogleDriveClient,
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
|
|
@ -537,6 +703,27 @@ async def _process_single_file(
|
||||||
try:
|
try:
|
||||||
logger.info(f"Processing file: {file_name} ({mime_type})")
|
logger.info(f"Processing file: {file_name} ({mime_type})")
|
||||||
|
|
||||||
|
# Early check: Is this a rename-only update?
|
||||||
|
# This optimization prevents downloading and ETL processing for files
|
||||||
|
# where only the name changed but content is the same.
|
||||||
|
is_rename_only, rename_message = await _check_rename_only_update(
|
||||||
|
session=session,
|
||||||
|
file=file,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_rename_only:
|
||||||
|
await task_logger.log_task_progress(
|
||||||
|
log_entry,
|
||||||
|
f"Skipped ETL for {file_name}: {rename_message}",
|
||||||
|
{"status": "rename_only", "reason": rename_message},
|
||||||
|
)
|
||||||
|
# Return 1 for renamed files (they are "indexed" in the sense that they're updated)
|
||||||
|
# Return 0 for unchanged files
|
||||||
|
if "renamed" in (rename_message or "").lower():
|
||||||
|
return 1, 0
|
||||||
|
return 0, 1
|
||||||
|
|
||||||
_, error, _ = await download_and_process_file(
|
_, error, _ = await download_and_process_file(
|
||||||
client=drive_client,
|
client=drive_client,
|
||||||
file=file,
|
file=file,
|
||||||
|
|
@ -564,7 +751,15 @@ async def _process_single_file(
|
||||||
|
|
||||||
|
|
||||||
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
||||||
"""Remove a document that was deleted in Drive."""
|
"""Remove a document that was deleted in Drive.
|
||||||
|
|
||||||
|
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
|
||||||
|
"""
|
||||||
|
from sqlalchemy import select
|
||||||
|
|
||||||
|
from app.db import Document
|
||||||
|
|
||||||
|
# First try with file_id-based hash (new method)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
)
|
)
|
||||||
|
|
@ -573,6 +768,19 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
|
||||||
session, unique_identifier_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If not found, search by metadata (for legacy documents with filename-based hash)
|
||||||
|
if not existing_document:
|
||||||
|
result = await session.execute(
|
||||||
|
select(Document).where(
|
||||||
|
Document.search_space_id == search_space_id,
|
||||||
|
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||||
|
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
existing_document = result.scalar_one_or_none()
|
||||||
|
if existing_document:
|
||||||
|
logger.info(f"Found legacy document by metadata for file_id: {file_id}")
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
await session.delete(existing_document)
|
await session.delete(existing_document)
|
||||||
logger.info(f"Removed deleted file document: {file_id}")
|
logger.info(f"Removed deleted file document: {file_id}")
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ from app.utils.document_converters import (
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_document_by_unique_identifier,
|
check_document_by_unique_identifier,
|
||||||
|
check_duplicate_document,
|
||||||
get_current_timestamp,
|
get_current_timestamp,
|
||||||
)
|
)
|
||||||
from .markdown_processor import add_received_markdown_file_document
|
from .markdown_processor import add_received_markdown_file_document
|
||||||
|
|
@ -49,6 +50,160 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_google_drive_unique_identifier(
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> tuple[str, str | None]:
|
||||||
|
"""
|
||||||
|
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||||
|
|
||||||
|
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||||
|
For other files, uses filename.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connector: Optional connector info dict with type and metadata
|
||||||
|
filename: The filename (used for non-Google Drive files or as fallback)
|
||||||
|
search_space_id: The search space ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (primary_hash, legacy_hash or None)
|
||||||
|
- For Google Drive: (file_id_based_hash, filename_based_hash for migration)
|
||||||
|
- For other sources: (filename_based_hash, None)
|
||||||
|
"""
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
file_id = metadata.get("google_drive_file_id")
|
||||||
|
|
||||||
|
if file_id:
|
||||||
|
# New method: use file_id as unique identifier (doesn't change on rename)
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
|
)
|
||||||
|
# Legacy method: for backward compatibility with existing documents
|
||||||
|
# that were indexed with filename-based hash
|
||||||
|
legacy_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, legacy_hash
|
||||||
|
|
||||||
|
# For non-Google Drive files, use filename as before
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, None
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_existing_document_update(
|
||||||
|
session: AsyncSession,
|
||||||
|
existing_document: Document,
|
||||||
|
content_hash: str,
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
primary_hash: str,
|
||||||
|
) -> tuple[bool, Document | None]:
|
||||||
|
"""
|
||||||
|
Handle update logic for an existing document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
existing_document: The existing document found in database
|
||||||
|
content_hash: Hash of the new content
|
||||||
|
connector: Optional connector info
|
||||||
|
filename: Current filename
|
||||||
|
primary_hash: The primary hash (file_id based for Google Drive)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (should_skip_processing, document_to_return)
|
||||||
|
- (True, document): Content unchanged, just return existing document
|
||||||
|
- (False, None): Content changed, need to re-process
|
||||||
|
"""
|
||||||
|
# Check if this document needs hash migration (found via legacy hash)
|
||||||
|
if existing_document.unique_identifier_hash != primary_hash:
|
||||||
|
existing_document.unique_identifier_hash = primary_hash
|
||||||
|
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||||
|
|
||||||
|
# Check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
connector_metadata = connector.get("metadata", {})
|
||||||
|
new_name = connector_metadata.get("google_drive_file_name")
|
||||||
|
# Check both possible keys for old name (FILE_NAME is used in stored documents)
|
||||||
|
doc_metadata = existing_document.document_metadata or {}
|
||||||
|
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
|
||||||
|
"google_drive_file_name"
|
||||||
|
)
|
||||||
|
|
||||||
|
if new_name and old_name and old_name != new_name:
|
||||||
|
# File was renamed - update title and metadata, skip expensive processing
|
||||||
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
|
|
||||||
|
existing_document.title = new_name
|
||||||
|
if not existing_document.document_metadata:
|
||||||
|
existing_document.document_metadata = {}
|
||||||
|
existing_document.document_metadata["FILE_NAME"] = new_name
|
||||||
|
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||||
|
flag_modified(existing_document, "document_metadata")
|
||||||
|
await session.commit()
|
||||||
|
logging.info(
|
||||||
|
f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.info(f"Document for file {filename} unchanged. Skipping.")
|
||||||
|
return True, existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - need to re-process
|
||||||
|
logging.info(f"Content changed for file {filename}. Updating document.")
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
|
async def find_existing_document_with_migration(
|
||||||
|
session: AsyncSession,
|
||||||
|
primary_hash: str,
|
||||||
|
legacy_hash: str | None,
|
||||||
|
content_hash: str | None = None,
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Find existing document, checking both new hash and legacy hash for migration,
|
||||||
|
with fallback to content_hash for cross-source deduplication.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
primary_hash: The primary hash (file_id based for Google Drive)
|
||||||
|
legacy_hash: The legacy hash (filename based) for migration, or None
|
||||||
|
content_hash: The content hash for fallback deduplication, or None
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Existing document if found, None otherwise
|
||||||
|
"""
|
||||||
|
# First check with primary hash (new method)
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||||
|
|
||||||
|
# If not found and we have a legacy hash, check with that (migration path)
|
||||||
|
if not existing_document and legacy_hash:
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, legacy_hash
|
||||||
|
)
|
||||||
|
if existing_document:
|
||||||
|
logging.info(
|
||||||
|
"Found legacy document (filename-based hash), will migrate to file_id-based hash"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fallback: check by content_hash to catch duplicates from different sources
|
||||||
|
# This prevents unique constraint violations when the same content exists
|
||||||
|
# under a different unique_identifier (e.g., manual upload vs Google Drive)
|
||||||
|
if not existing_document and content_hash:
|
||||||
|
existing_document = await check_duplicate_document(session, content_hash)
|
||||||
|
if existing_document:
|
||||||
|
logging.info(
|
||||||
|
f"Found duplicate content from different source (content_hash match). "
|
||||||
|
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return existing_document
|
||||||
|
|
||||||
|
|
||||||
async def parse_with_llamacloud_retry(
|
async def parse_with_llamacloud_retry(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
estimated_pages: int,
|
estimated_pages: int,
|
||||||
|
|
@ -158,6 +313,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements: list[LangChainDocument],
|
unstructured_processed_elements: list[LangChainDocument],
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store a file document using Unstructured service.
|
Process and store a file document using Unstructured service.
|
||||||
|
|
@ -168,6 +324,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements: Processed elements from Unstructured
|
unstructured_processed_elements: Processed elements from Unstructured
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -177,29 +334,32 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements
|
unstructured_processed_elements
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive and content_hash fallback)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash, content_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session,
|
||||||
return existing_document
|
existing_document,
|
||||||
else:
|
content_hash,
|
||||||
# Content has changed - update the existing document
|
connector,
|
||||||
logging.info(
|
file_name,
|
||||||
f"Content changed for file {file_name}. Updating document."
|
primary_hash,
|
||||||
)
|
)
|
||||||
|
if should_skip:
|
||||||
|
return doc
|
||||||
|
# Content changed - continue to update
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -251,10 +411,15 @@ async def add_received_file_document_using_unstructured(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "UNSTRUCTURED",
|
"ETL_SERVICE": "UNSTRUCTURED",
|
||||||
|
|
@ -263,7 +428,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
|
|
@ -288,6 +453,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
llamacloud_markdown_document: str,
|
llamacloud_markdown_document: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store document content parsed by LlamaCloud.
|
Process and store document content parsed by LlamaCloud.
|
||||||
|
|
@ -298,6 +464,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
llamacloud_markdown_document: Markdown content from LlamaCloud parsing
|
llamacloud_markdown_document: Markdown content from LlamaCloud parsing
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -306,29 +473,32 @@ async def add_received_file_document_using_llamacloud(
|
||||||
# Combine all markdown documents into one
|
# Combine all markdown documents into one
|
||||||
file_in_markdown = llamacloud_markdown_document
|
file_in_markdown = llamacloud_markdown_document
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive and content_hash fallback)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash, content_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session,
|
||||||
return existing_document
|
existing_document,
|
||||||
else:
|
content_hash,
|
||||||
# Content has changed - update the existing document
|
connector,
|
||||||
logging.info(
|
file_name,
|
||||||
f"Content changed for file {file_name}. Updating document."
|
primary_hash,
|
||||||
)
|
)
|
||||||
|
if should_skip:
|
||||||
|
return doc
|
||||||
|
# Content changed - continue to update
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -380,10 +550,15 @@ async def add_received_file_document_using_llamacloud(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "LLAMACLOUD",
|
"ETL_SERVICE": "LLAMACLOUD",
|
||||||
|
|
@ -392,7 +567,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
|
|
@ -419,6 +594,7 @@ async def add_received_file_document_using_docling(
|
||||||
docling_markdown_document: str,
|
docling_markdown_document: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store document content parsed by Docling.
|
Process and store document content parsed by Docling.
|
||||||
|
|
@ -429,6 +605,7 @@ async def add_received_file_document_using_docling(
|
||||||
docling_markdown_document: Markdown content from Docling parsing
|
docling_markdown_document: Markdown content from Docling parsing
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -436,35 +613,38 @@ async def add_received_file_document_using_docling(
|
||||||
try:
|
try:
|
||||||
file_in_markdown = docling_markdown_document
|
file_in_markdown = docling_markdown_document
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive and content_hash fallback)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash, content_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session,
|
||||||
return existing_document
|
existing_document,
|
||||||
else:
|
content_hash,
|
||||||
# Content has changed - update the existing document
|
connector,
|
||||||
logging.info(
|
file_name,
|
||||||
f"Content changed for file {file_name}. Updating document."
|
primary_hash,
|
||||||
)
|
)
|
||||||
|
if should_skip:
|
||||||
|
return doc
|
||||||
|
# Content changed - continue to update
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
if not user_llm:
|
if not user_llm:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate summary using chunked processing for large documents
|
# Generate summary using chunked processing for large documents
|
||||||
|
|
@ -534,10 +714,15 @@ async def add_received_file_document_using_docling(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "DOCLING",
|
"ETL_SERVICE": "DOCLING",
|
||||||
|
|
@ -546,15 +731,15 @@ async def add_received_file_document_using_docling(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
)
|
)
|
||||||
|
|
||||||
session.add(document)
|
session.add(document)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
await session.refresh(document)
|
await session.refresh(document)
|
||||||
|
|
||||||
return document
|
return document
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
|
|
@ -650,7 +835,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Process markdown directly through specialized function
|
# Process markdown directly through specialized function
|
||||||
result = await add_received_markdown_file_document(
|
result = await add_received_markdown_file_document(
|
||||||
session, filename, markdown_content, search_space_id, user_id
|
session, filename, markdown_content, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -790,7 +975,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Process transcription as markdown document
|
# Process transcription as markdown document
|
||||||
result = await add_received_markdown_file_document(
|
result = await add_received_markdown_file_document(
|
||||||
session, filename, transcribed_text, search_space_id, user_id
|
session, filename, transcribed_text, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -955,7 +1140,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Pass the documents to the existing background task
|
# Pass the documents to the existing background task
|
||||||
result = await add_received_file_document_using_unstructured(
|
result = await add_received_file_document_using_unstructured(
|
||||||
session, filename, docs, search_space_id, user_id
|
session, filename, docs, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -1103,6 +1288,7 @@ async def process_file_in_background(
|
||||||
llamacloud_markdown_document=markdown_content,
|
llamacloud_markdown_document=markdown_content,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
|
connector=connector,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Track if this document was successfully created
|
# Track if this document was successfully created
|
||||||
|
|
@ -1256,6 +1442,7 @@ async def process_file_in_background(
|
||||||
docling_markdown_document=result["content"],
|
docling_markdown_document=result["content"],
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
|
connector=connector,
|
||||||
)
|
)
|
||||||
|
|
||||||
if doc_result:
|
if doc_result:
|
||||||
|
|
|
||||||
|
|
@ -19,16 +19,157 @@ from app.utils.document_converters import (
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
check_document_by_unique_identifier,
|
check_document_by_unique_identifier,
|
||||||
|
check_duplicate_document,
|
||||||
get_current_timestamp,
|
get_current_timestamp,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_google_drive_unique_identifier(
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> tuple[str, str | None]:
|
||||||
|
"""
|
||||||
|
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||||
|
|
||||||
|
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||||
|
For other files, uses filename.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connector: Optional connector info dict with type and metadata
|
||||||
|
filename: The filename (used for non-Google Drive files or as fallback)
|
||||||
|
search_space_id: The search space ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (primary_hash, legacy_hash or None)
|
||||||
|
"""
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
file_id = metadata.get("google_drive_file_id")
|
||||||
|
|
||||||
|
if file_id:
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
|
)
|
||||||
|
legacy_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, legacy_hash
|
||||||
|
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, None
|
||||||
|
|
||||||
|
|
||||||
|
async def _find_existing_document_with_migration(
|
||||||
|
session: AsyncSession,
|
||||||
|
primary_hash: str,
|
||||||
|
legacy_hash: str | None,
|
||||||
|
content_hash: str | None = None,
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Find existing document, checking both new hash and legacy hash for migration,
|
||||||
|
with fallback to content_hash for cross-source deduplication.
|
||||||
|
"""
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||||
|
|
||||||
|
if not existing_document and legacy_hash:
|
||||||
|
existing_document = await check_document_by_unique_identifier(
|
||||||
|
session, legacy_hash
|
||||||
|
)
|
||||||
|
if existing_document:
|
||||||
|
logging.info(
|
||||||
|
"Found legacy document (filename-based hash), will migrate to file_id-based hash"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fallback: check by content_hash to catch duplicates from different sources
|
||||||
|
if not existing_document and content_hash:
|
||||||
|
existing_document = await check_duplicate_document(session, content_hash)
|
||||||
|
if existing_document:
|
||||||
|
logging.info(
|
||||||
|
f"Found duplicate content from different source (content_hash match). "
|
||||||
|
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return existing_document
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_existing_document_update(
|
||||||
|
session: AsyncSession,
|
||||||
|
existing_document: Document,
|
||||||
|
content_hash: str,
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
primary_hash: str,
|
||||||
|
task_logger: TaskLoggingService,
|
||||||
|
log_entry,
|
||||||
|
) -> tuple[bool, Document | None]:
|
||||||
|
"""
|
||||||
|
Handle update logic for an existing document.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (should_skip_processing, document_to_return)
|
||||||
|
"""
|
||||||
|
# Check if this document needs hash migration
|
||||||
|
if existing_document.unique_identifier_hash != primary_hash:
|
||||||
|
existing_document.unique_identifier_hash = primary_hash
|
||||||
|
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||||
|
|
||||||
|
# Check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
connector_metadata = connector.get("metadata", {})
|
||||||
|
new_name = connector_metadata.get("google_drive_file_name")
|
||||||
|
# Check both possible keys for old name (FILE_NAME is used in stored documents)
|
||||||
|
doc_metadata = existing_document.document_metadata or {}
|
||||||
|
old_name = (
|
||||||
|
doc_metadata.get("FILE_NAME")
|
||||||
|
or doc_metadata.get("google_drive_file_name")
|
||||||
|
or doc_metadata.get("file_name")
|
||||||
|
)
|
||||||
|
|
||||||
|
if new_name and old_name and old_name != new_name:
|
||||||
|
# File was renamed - update title and metadata, skip expensive processing
|
||||||
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
|
|
||||||
|
existing_document.title = new_name
|
||||||
|
if not existing_document.document_metadata:
|
||||||
|
existing_document.document_metadata = {}
|
||||||
|
existing_document.document_metadata["FILE_NAME"] = new_name
|
||||||
|
existing_document.document_metadata["file_name"] = new_name
|
||||||
|
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||||
|
flag_modified(existing_document, "document_metadata")
|
||||||
|
await session.commit()
|
||||||
|
logging.info(
|
||||||
|
f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
|
||||||
|
)
|
||||||
|
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Markdown file document unchanged: {filename}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
|
||||||
|
return True, existing_document
|
||||||
|
else:
|
||||||
|
logging.info(
|
||||||
|
f"Content changed for markdown file {filename}. Updating document."
|
||||||
|
)
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
async def add_received_markdown_file_document(
|
async def add_received_markdown_file_document(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
file_name: str,
|
file_name: str,
|
||||||
file_in_markdown: str,
|
file_in_markdown: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store a markdown file document.
|
Process and store a markdown file document.
|
||||||
|
|
@ -39,6 +180,7 @@ async def add_received_markdown_file_document(
|
||||||
file_in_markdown: Content of the markdown file
|
file_in_markdown: Content of the markdown file
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -58,39 +200,34 @@ async def add_received_markdown_file_document(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate unique identifier hash for this markdown file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = _get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive and content_hash fallback)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await _find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash, content_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await _handle_existing_document_update(
|
||||||
await task_logger.log_task_success(
|
session,
|
||||||
log_entry,
|
existing_document,
|
||||||
f"Markdown file document unchanged: {file_name}",
|
content_hash,
|
||||||
{
|
connector,
|
||||||
"duplicate_detected": True,
|
file_name,
|
||||||
"existing_document_id": existing_document.id,
|
primary_hash,
|
||||||
},
|
task_logger,
|
||||||
)
|
log_entry,
|
||||||
logging.info(
|
)
|
||||||
f"Document for markdown file {file_name} unchanged. Skipping."
|
if should_skip:
|
||||||
)
|
return doc
|
||||||
return existing_document
|
# Content changed - continue to update
|
||||||
else:
|
|
||||||
# Content has changed - update the existing document
|
|
||||||
logging.info(
|
|
||||||
f"Content changed for markdown file {file_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -139,10 +276,15 @@ async def add_received_markdown_file_document(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
},
|
},
|
||||||
|
|
@ -150,7 +292,7 @@ async def add_received_markdown_file_document(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
|
import { AlertCircle } from "lucide-react";
|
||||||
import { Label } from "@/components/ui/label";
|
import { Label } from "@/components/ui/label";
|
||||||
import {
|
import {
|
||||||
Select,
|
Select,
|
||||||
|
|
@ -16,6 +17,8 @@ interface PeriodicSyncConfigProps {
|
||||||
frequencyMinutes: string;
|
frequencyMinutes: string;
|
||||||
onEnabledChange: (enabled: boolean) => void;
|
onEnabledChange: (enabled: boolean) => void;
|
||||||
onFrequencyChange: (frequency: string) => void;
|
onFrequencyChange: (frequency: string) => void;
|
||||||
|
disabled?: boolean;
|
||||||
|
disabledMessage?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
|
export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
|
||||||
|
|
@ -23,6 +26,8 @@ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
onEnabledChange,
|
onEnabledChange,
|
||||||
onFrequencyChange,
|
onFrequencyChange,
|
||||||
|
disabled = false,
|
||||||
|
disabledMessage,
|
||||||
}) => {
|
}) => {
|
||||||
return (
|
return (
|
||||||
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||||
|
|
@ -33,9 +38,17 @@ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
|
||||||
Automatically re-index at regular intervals
|
Automatically re-index at regular intervals
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
<Switch checked={enabled} onCheckedChange={onEnabledChange} />
|
<Switch checked={enabled} onCheckedChange={onEnabledChange} disabled={disabled} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Show disabled message when periodic sync can't be enabled */}
|
||||||
|
{disabled && disabledMessage && (
|
||||||
|
<div className="mt-3 flex items-start gap-2 text-amber-600 dark:text-amber-400">
|
||||||
|
<AlertCircle className="size-4 mt-0.5 shrink-0" />
|
||||||
|
<p className="text-xs sm:text-sm">{disabledMessage}</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{enabled && (
|
{enabled && (
|
||||||
<div className="mt-4 pt-4 border-t border-slate-400/20 space-y-3">
|
<div className="mt-4 pt-4 border-t border-slate-400/20 space-y-3">
|
||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,19 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { Info } from "lucide-react";
|
import { File, FileText, FileSpreadsheet, FolderClosed, Image, Presentation } from "lucide-react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { useEffect, useState } from "react";
|
import { useEffect, useState } from "react";
|
||||||
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
|
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
|
||||||
import { Alert, AlertDescription } from "@/components/ui/alert";
|
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
|
import { Label } from "@/components/ui/label";
|
||||||
|
import {
|
||||||
|
Select,
|
||||||
|
SelectContent,
|
||||||
|
SelectItem,
|
||||||
|
SelectTrigger,
|
||||||
|
SelectValue,
|
||||||
|
} from "@/components/ui/select";
|
||||||
|
import { Switch } from "@/components/ui/switch";
|
||||||
import type { ConnectorConfigProps } from "../index";
|
import type { ConnectorConfigProps } from "../index";
|
||||||
|
|
||||||
interface SelectedFolder {
|
interface SelectedFolder {
|
||||||
|
|
@ -13,128 +21,292 @@ interface SelectedFolder {
|
||||||
name: string;
|
name: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface IndexingOptions {
|
||||||
|
max_files_per_folder: number;
|
||||||
|
incremental_sync: boolean;
|
||||||
|
include_subfolders: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
const DEFAULT_INDEXING_OPTIONS: IndexingOptions = {
|
||||||
|
max_files_per_folder: 100,
|
||||||
|
incremental_sync: true,
|
||||||
|
include_subfolders: true,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Helper to get appropriate icon for file type based on file name
|
||||||
|
function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") {
|
||||||
|
const lowerName = fileName.toLowerCase();
|
||||||
|
// Spreadsheets
|
||||||
|
if (
|
||||||
|
lowerName.endsWith(".xlsx") ||
|
||||||
|
lowerName.endsWith(".xls") ||
|
||||||
|
lowerName.endsWith(".csv") ||
|
||||||
|
lowerName.includes("spreadsheet")
|
||||||
|
) {
|
||||||
|
return <FileSpreadsheet className={`${className} text-green-500`} />;
|
||||||
|
}
|
||||||
|
// Presentations
|
||||||
|
if (
|
||||||
|
lowerName.endsWith(".pptx") ||
|
||||||
|
lowerName.endsWith(".ppt") ||
|
||||||
|
lowerName.includes("presentation")
|
||||||
|
) {
|
||||||
|
return <Presentation className={`${className} text-orange-500`} />;
|
||||||
|
}
|
||||||
|
// Documents (word, text only - not PDF)
|
||||||
|
if (
|
||||||
|
lowerName.endsWith(".docx") ||
|
||||||
|
lowerName.endsWith(".doc") ||
|
||||||
|
lowerName.endsWith(".txt") ||
|
||||||
|
lowerName.includes("document") ||
|
||||||
|
lowerName.includes("word") ||
|
||||||
|
lowerName.includes("text")
|
||||||
|
) {
|
||||||
|
return <FileText className={`${className} text-gray-500`} />;
|
||||||
|
}
|
||||||
|
// Images
|
||||||
|
if (
|
||||||
|
lowerName.endsWith(".png") ||
|
||||||
|
lowerName.endsWith(".jpg") ||
|
||||||
|
lowerName.endsWith(".jpeg") ||
|
||||||
|
lowerName.endsWith(".gif") ||
|
||||||
|
lowerName.endsWith(".webp") ||
|
||||||
|
lowerName.endsWith(".svg")
|
||||||
|
) {
|
||||||
|
return <Image className={`${className} text-purple-500`} />;
|
||||||
|
}
|
||||||
|
// Default (including PDF)
|
||||||
|
return <File className={`${className} text-gray-500`} />;
|
||||||
|
}
|
||||||
|
|
||||||
export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => {
|
export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => {
|
||||||
// Initialize with existing selected folders and files from connector config
|
// Initialize with existing selected folders and files from connector config
|
||||||
const existingFolders =
|
const existingFolders =
|
||||||
(connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
(connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
||||||
const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
||||||
|
const existingIndexingOptions =
|
||||||
|
(connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
|
||||||
|
|
||||||
const [selectedFolders, setSelectedFolders] = useState<SelectedFolder[]>(existingFolders);
|
const [selectedFolders, setSelectedFolders] = useState<SelectedFolder[]>(existingFolders);
|
||||||
const [selectedFiles, setSelectedFiles] = useState<SelectedFolder[]>(existingFiles);
|
const [selectedFiles, setSelectedFiles] = useState<SelectedFolder[]>(existingFiles);
|
||||||
const [showFolderSelector, setShowFolderSelector] = useState(false);
|
const [showFolderSelector, setShowFolderSelector] = useState(false);
|
||||||
|
const [indexingOptions, setIndexingOptions] = useState<IndexingOptions>(existingIndexingOptions);
|
||||||
|
|
||||||
// Update selected folders and files when connector config changes
|
// Update selected folders and files when connector config changes
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
||||||
const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
||||||
|
const options =
|
||||||
|
(connector.config?.indexing_options as IndexingOptions | undefined) ||
|
||||||
|
DEFAULT_INDEXING_OPTIONS;
|
||||||
setSelectedFolders(folders);
|
setSelectedFolders(folders);
|
||||||
setSelectedFiles(files);
|
setSelectedFiles(files);
|
||||||
|
setIndexingOptions(options);
|
||||||
}, [connector.config]);
|
}, [connector.config]);
|
||||||
|
|
||||||
const handleSelectFolders = (folders: SelectedFolder[]) => {
|
const updateConfig = (
|
||||||
setSelectedFolders(folders);
|
folders: SelectedFolder[],
|
||||||
|
files: SelectedFolder[],
|
||||||
|
options: IndexingOptions
|
||||||
|
) => {
|
||||||
if (onConfigChange) {
|
if (onConfigChange) {
|
||||||
// Store folder IDs and names in config for indexing
|
|
||||||
onConfigChange({
|
onConfigChange({
|
||||||
...connector.config,
|
...connector.config,
|
||||||
selected_folders: folders,
|
selected_folders: folders,
|
||||||
selected_files: selectedFiles, // Preserve existing files
|
selected_files: files,
|
||||||
|
indexing_options: options,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const handleSelectFolders = (folders: SelectedFolder[]) => {
|
||||||
|
setSelectedFolders(folders);
|
||||||
|
updateConfig(folders, selectedFiles, indexingOptions);
|
||||||
|
};
|
||||||
|
|
||||||
const handleSelectFiles = (files: SelectedFolder[]) => {
|
const handleSelectFiles = (files: SelectedFolder[]) => {
|
||||||
setSelectedFiles(files);
|
setSelectedFiles(files);
|
||||||
if (onConfigChange) {
|
updateConfig(selectedFolders, files, indexingOptions);
|
||||||
// Store file IDs and names in config for indexing
|
};
|
||||||
onConfigChange({
|
|
||||||
...connector.config,
|
const handleIndexingOptionChange = (key: keyof IndexingOptions, value: number | boolean) => {
|
||||||
selected_folders: selectedFolders, // Preserve existing folders
|
const newOptions = { ...indexingOptions, [key]: value };
|
||||||
selected_files: files,
|
setIndexingOptions(newOptions);
|
||||||
});
|
updateConfig(selectedFolders, selectedFiles, newOptions);
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const totalSelected = selectedFolders.length + selectedFiles.length;
|
const totalSelected = selectedFolders.length + selectedFiles.length;
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
<div className="space-y-4">
|
||||||
<div className="space-y-1 sm:space-y-2">
|
{/* Folder & File Selection */}
|
||||||
<h3 className="font-medium text-sm sm:text-base">Folder & File Selection</h3>
|
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-3 sm:space-y-4">
|
||||||
<p className="text-xs sm:text-sm text-muted-foreground">
|
<div className="space-y-1 sm:space-y-2">
|
||||||
Select specific folders and/or individual files to index. Only files directly in each
|
<h3 className="font-medium text-sm sm:text-base">Folder & File Selection</h3>
|
||||||
folder will be processed—subfolders must be selected separately.
|
<p className="text-xs sm:text-sm text-muted-foreground">
|
||||||
</p>
|
Select specific folders and/or individual files to index.
|
||||||
</div>
|
|
||||||
|
|
||||||
{totalSelected > 0 && (
|
|
||||||
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
|
|
||||||
<p className="font-medium">
|
|
||||||
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:
|
|
||||||
{selectedFolders.length > 0 &&
|
|
||||||
` ${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`}
|
|
||||||
{selectedFiles.length > 0 &&
|
|
||||||
` ${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`}
|
|
||||||
</p>
|
</p>
|
||||||
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
|
|
||||||
{selectedFolders.map((folder) => (
|
|
||||||
<p
|
|
||||||
key={folder.id}
|
|
||||||
className="text-xs sm:text-sm text-muted-foreground truncate"
|
|
||||||
title={folder.name}
|
|
||||||
>
|
|
||||||
📁 {folder.name}
|
|
||||||
</p>
|
|
||||||
))}
|
|
||||||
{selectedFiles.map((file) => (
|
|
||||||
<p
|
|
||||||
key={file.id}
|
|
||||||
className="text-xs sm:text-sm text-muted-foreground truncate"
|
|
||||||
title={file.name}
|
|
||||||
>
|
|
||||||
📄 {file.name}
|
|
||||||
</p>
|
|
||||||
))}
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
)}
|
|
||||||
|
|
||||||
{showFolderSelector ? (
|
{totalSelected > 0 && (
|
||||||
<div className="space-y-2 sm:space-y-3">
|
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
|
||||||
<GoogleDriveFolderTree
|
<p className="font-medium">
|
||||||
connectorId={connector.id}
|
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => {
|
||||||
selectedFolders={selectedFolders}
|
const parts: string[] = [];
|
||||||
onSelectFolders={handleSelectFolders}
|
if (selectedFolders.length > 0) {
|
||||||
selectedFiles={selectedFiles}
|
parts.push(
|
||||||
onSelectFiles={handleSelectFiles}
|
`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`
|
||||||
/>
|
);
|
||||||
|
}
|
||||||
|
if (selectedFiles.length > 0) {
|
||||||
|
parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
|
||||||
|
}
|
||||||
|
return parts.length > 0 ? `(${parts.join(" ")})` : "";
|
||||||
|
})()}
|
||||||
|
</p>
|
||||||
|
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
|
||||||
|
{selectedFolders.map((folder) => (
|
||||||
|
<p
|
||||||
|
key={folder.id}
|
||||||
|
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
|
||||||
|
title={folder.name}
|
||||||
|
>
|
||||||
|
<FolderClosed className="size-3.5 shrink-0 text-gray-500" />
|
||||||
|
{folder.name}
|
||||||
|
</p>
|
||||||
|
))}
|
||||||
|
{selectedFiles.map((file) => (
|
||||||
|
<p
|
||||||
|
key={file.id}
|
||||||
|
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
|
||||||
|
title={file.name}
|
||||||
|
>
|
||||||
|
{getFileIconFromName(file.name)}
|
||||||
|
{file.name}
|
||||||
|
</p>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{showFolderSelector ? (
|
||||||
|
<div className="space-y-2 sm:space-y-3">
|
||||||
|
<GoogleDriveFolderTree
|
||||||
|
connectorId={connector.id}
|
||||||
|
selectedFolders={selectedFolders}
|
||||||
|
onSelectFolders={handleSelectFolders}
|
||||||
|
selectedFiles={selectedFiles}
|
||||||
|
onSelectFiles={handleSelectFiles}
|
||||||
|
/>
|
||||||
|
<Button
|
||||||
|
type="button"
|
||||||
|
variant="outline"
|
||||||
|
size="sm"
|
||||||
|
onClick={() => setShowFolderSelector(false)}
|
||||||
|
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
|
||||||
|
>
|
||||||
|
Done Selecting
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
<Button
|
<Button
|
||||||
type="button"
|
type="button"
|
||||||
variant="outline"
|
variant="outline"
|
||||||
size="sm"
|
onClick={() => setShowFolderSelector(true)}
|
||||||
onClick={() => setShowFolderSelector(false)}
|
|
||||||
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
|
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
|
||||||
>
|
>
|
||||||
Done Selecting
|
{totalSelected > 0 ? "Change Selection" : "Select Folders & Files"}
|
||||||
</Button>
|
</Button>
|
||||||
</div>
|
)}
|
||||||
) : (
|
</div>
|
||||||
<Button
|
|
||||||
type="button"
|
|
||||||
variant="outline"
|
|
||||||
onClick={() => setShowFolderSelector(true)}
|
|
||||||
className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 hover:bg-slate-400/10 dark:hover:bg-white/10 text-xs sm:text-sm h-8 sm:h-9"
|
|
||||||
>
|
|
||||||
{totalSelected > 0 ? "Change Selection" : "Select Folders & Files"}
|
|
||||||
</Button>
|
|
||||||
)}
|
|
||||||
|
|
||||||
<Alert className="bg-slate-400/5 dark:bg-white/5 border-slate-400/20 p-2 sm:p-3 flex items-center gap-2 [&>svg]:relative [&>svg]:left-0 [&>svg]:top-0 [&>svg+div]:translate-y-0">
|
{/* Indexing Options */}
|
||||||
<Info className="h-3 w-3 sm:h-4 sm:w-4 shrink-0" />
|
<div className="rounded-xl border border-border bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6 space-y-4">
|
||||||
<AlertDescription className="text-[10px] sm:text-xs !pl-0">
|
<div className="space-y-1 sm:space-y-2">
|
||||||
Folder and file selection is used when indexing. You can change this selection when you
|
<h3 className="font-medium text-sm sm:text-base">Indexing Options</h3>
|
||||||
start indexing.
|
<p className="text-xs sm:text-sm text-muted-foreground">
|
||||||
</AlertDescription>
|
Configure how files are indexed from your Google Drive.
|
||||||
</Alert>
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Max files per folder */}
|
||||||
|
<div className="space-y-2">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label htmlFor="max-files" className="text-sm font-medium">
|
||||||
|
Max files per folder
|
||||||
|
</Label>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Maximum number of files to index from each folder
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Select
|
||||||
|
value={indexingOptions.max_files_per_folder.toString()}
|
||||||
|
onValueChange={(value) =>
|
||||||
|
handleIndexingOptionChange("max_files_per_folder", parseInt(value, 10))
|
||||||
|
}
|
||||||
|
>
|
||||||
|
<SelectTrigger
|
||||||
|
id="max-files"
|
||||||
|
className="w-[140px] bg-slate-400/5 dark:bg-slate-400/5 border-slate-400/20 text-xs sm:text-sm"
|
||||||
|
>
|
||||||
|
<SelectValue placeholder="Select limit" />
|
||||||
|
</SelectTrigger>
|
||||||
|
<SelectContent className="z-[100]">
|
||||||
|
<SelectItem value="50" className="text-xs sm:text-sm">
|
||||||
|
50 files
|
||||||
|
</SelectItem>
|
||||||
|
<SelectItem value="100" className="text-xs sm:text-sm">
|
||||||
|
100 files
|
||||||
|
</SelectItem>
|
||||||
|
<SelectItem value="250" className="text-xs sm:text-sm">
|
||||||
|
250 files
|
||||||
|
</SelectItem>
|
||||||
|
<SelectItem value="500" className="text-xs sm:text-sm">
|
||||||
|
500 files
|
||||||
|
</SelectItem>
|
||||||
|
<SelectItem value="1000" className="text-xs sm:text-sm">
|
||||||
|
1000 files
|
||||||
|
</SelectItem>
|
||||||
|
</SelectContent>
|
||||||
|
</Select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Incremental sync toggle */}
|
||||||
|
<div className="flex items-center justify-between pt-2 border-t border-slate-400/20">
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label htmlFor="incremental-sync" className="text-sm font-medium">
|
||||||
|
Incremental sync
|
||||||
|
</Label>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Only sync changes since last index (faster). Disable for a full re-index.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch
|
||||||
|
id="incremental-sync"
|
||||||
|
checked={indexingOptions.incremental_sync}
|
||||||
|
onCheckedChange={(checked) => handleIndexingOptionChange("incremental_sync", checked)}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Include subfolders toggle */}
|
||||||
|
<div className="flex items-center justify-between pt-2 border-t border-slate-400/20">
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label htmlFor="include-subfolders" className="text-sm font-medium">
|
||||||
|
Include subfolders
|
||||||
|
</Label>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Recursively index files in subfolders of selected folders
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch
|
||||||
|
id="include-subfolders"
|
||||||
|
checked={indexingOptions.include_subfolders}
|
||||||
|
onCheckedChange={(checked) => handleIndexingOptionChange("include_subfolders", checked)}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -222,15 +222,36 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{/* Periodic sync - not shown for Google Drive */}
|
{/* Periodic sync - shown for all indexable connectors */}
|
||||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && (
|
{(() => {
|
||||||
<PeriodicSyncConfig
|
// Check if Google Drive has folders/files selected
|
||||||
enabled={periodicEnabled}
|
const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
|
||||||
frequencyMinutes={frequencyMinutes}
|
const selectedFolders =
|
||||||
onEnabledChange={onPeriodicEnabledChange}
|
(connector.config?.selected_folders as
|
||||||
onFrequencyChange={onFrequencyChange}
|
| Array<{ id: string; name: string }>
|
||||||
/>
|
| undefined) || [];
|
||||||
)}
|
const selectedFiles =
|
||||||
|
(connector.config?.selected_files as
|
||||||
|
| Array<{ id: string; name: string }>
|
||||||
|
| undefined) || [];
|
||||||
|
const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
|
||||||
|
const isDisabled = isGoogleDrive && !hasItemsSelected;
|
||||||
|
|
||||||
|
return (
|
||||||
|
<PeriodicSyncConfig
|
||||||
|
enabled={periodicEnabled}
|
||||||
|
frequencyMinutes={frequencyMinutes}
|
||||||
|
onEnabledChange={onPeriodicEnabledChange}
|
||||||
|
onFrequencyChange={onFrequencyChange}
|
||||||
|
disabled={isDisabled}
|
||||||
|
disabledMessage={
|
||||||
|
isDisabled
|
||||||
|
? "Select at least one folder or file above to enable periodic sync"
|
||||||
|
: undefined
|
||||||
|
}
|
||||||
|
/>
|
||||||
|
);
|
||||||
|
})()}
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -219,11 +219,9 @@ export const useConnectorDialog = () => {
|
||||||
setEditingConnector(connector);
|
setEditingConnector(connector);
|
||||||
setConnectorConfig(connector.config);
|
setConnectorConfig(connector.config);
|
||||||
setConnectorName(connector.name);
|
setConnectorName(connector.name);
|
||||||
// Load existing periodic sync settings (disabled for Google Drive and non-indexable connectors)
|
// Load existing periodic sync settings (disabled for non-indexable connectors)
|
||||||
setPeriodicEnabled(
|
setPeriodicEnabled(
|
||||||
connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || !connector.is_indexable
|
!connector.is_indexable ? false : connector.periodic_indexing_enabled
|
||||||
? false
|
|
||||||
: connector.periodic_indexing_enabled
|
|
||||||
);
|
);
|
||||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||||
// Reset dates - user can set new ones for re-indexing
|
// Reset dates - user can set new ones for re-indexing
|
||||||
|
|
@ -882,20 +880,14 @@ export const useConnectorDialog = () => {
|
||||||
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
||||||
|
|
||||||
// Update connector with periodic sync settings and config changes
|
// Update connector with periodic sync settings and config changes
|
||||||
// Note: Periodic sync is disabled for Google Drive connectors
|
|
||||||
if (periodicEnabled || indexingConnectorConfig) {
|
if (periodicEnabled || indexingConnectorConfig) {
|
||||||
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
||||||
await updateConnector({
|
await updateConnector({
|
||||||
id: indexingConfig.connectorId,
|
id: indexingConfig.connectorId,
|
||||||
data: {
|
data: {
|
||||||
...(periodicEnabled &&
|
...(periodicEnabled && {
|
||||||
indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && {
|
periodic_indexing_enabled: true,
|
||||||
periodic_indexing_enabled: true,
|
indexing_frequency_minutes: frequency,
|
||||||
indexing_frequency_minutes: frequency,
|
|
||||||
}),
|
|
||||||
...(indexingConfig.connectorType === "GOOGLE_DRIVE_CONNECTOR" && {
|
|
||||||
periodic_indexing_enabled: false,
|
|
||||||
indexing_frequency_minutes: null,
|
|
||||||
}),
|
}),
|
||||||
...(indexingConnectorConfig && {
|
...(indexingConnectorConfig && {
|
||||||
config: indexingConnectorConfig,
|
config: indexingConnectorConfig,
|
||||||
|
|
@ -912,11 +904,18 @@ export const useConnectorDialog = () => {
|
||||||
const selectedFiles = indexingConnectorConfig.selected_files as
|
const selectedFiles = indexingConnectorConfig.selected_files as
|
||||||
| Array<{ id: string; name: string }>
|
| Array<{ id: string; name: string }>
|
||||||
| undefined;
|
| undefined;
|
||||||
|
const indexingOptions = indexingConnectorConfig.indexing_options as
|
||||||
|
| {
|
||||||
|
max_files_per_folder: number;
|
||||||
|
incremental_sync: boolean;
|
||||||
|
include_subfolders: boolean;
|
||||||
|
}
|
||||||
|
| undefined;
|
||||||
if (
|
if (
|
||||||
(selectedFolders && selectedFolders.length > 0) ||
|
(selectedFolders && selectedFolders.length > 0) ||
|
||||||
(selectedFiles && selectedFiles.length > 0)
|
(selectedFiles && selectedFiles.length > 0)
|
||||||
) {
|
) {
|
||||||
// Index with folder/file selection
|
// Index with folder/file selection and indexing options
|
||||||
await indexConnector({
|
await indexConnector({
|
||||||
connector_id: indexingConfig.connectorId,
|
connector_id: indexingConfig.connectorId,
|
||||||
queryParams: {
|
queryParams: {
|
||||||
|
|
@ -925,6 +924,11 @@ export const useConnectorDialog = () => {
|
||||||
body: {
|
body: {
|
||||||
folders: selectedFolders || [],
|
folders: selectedFolders || [],
|
||||||
files: selectedFiles || [],
|
files: selectedFiles || [],
|
||||||
|
indexing_options: indexingOptions || {
|
||||||
|
max_files_per_folder: 100,
|
||||||
|
incremental_sync: true,
|
||||||
|
include_subfolders: true,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -964,7 +968,7 @@ export const useConnectorDialog = () => {
|
||||||
);
|
);
|
||||||
|
|
||||||
// Track periodic indexing started if enabled
|
// Track periodic indexing started if enabled
|
||||||
if (periodicEnabled && indexingConfig.connectorType !== "GOOGLE_DRIVE_CONNECTOR") {
|
if (periodicEnabled) {
|
||||||
trackPeriodicIndexingStarted(
|
trackPeriodicIndexingStarted(
|
||||||
Number(searchSpaceId),
|
Number(searchSpaceId),
|
||||||
indexingConfig.connectorType,
|
indexingConfig.connectorType,
|
||||||
|
|
@ -1072,12 +1076,8 @@ export const useConnectorDialog = () => {
|
||||||
|
|
||||||
setEditingConnector(connector);
|
setEditingConnector(connector);
|
||||||
setConnectorName(connector.name);
|
setConnectorName(connector.name);
|
||||||
// Load existing periodic sync settings (disabled for Google Drive and non-indexable connectors)
|
// Load existing periodic sync settings (disabled for non-indexable connectors)
|
||||||
setPeriodicEnabled(
|
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
||||||
connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" || !connector.is_indexable
|
|
||||||
? false
|
|
||||||
: connector.periodic_indexing_enabled
|
|
||||||
);
|
|
||||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||||
// Reset dates - user can set new ones for re-indexing
|
// Reset dates - user can set new ones for re-indexing
|
||||||
setStartDate(undefined);
|
setStartDate(undefined);
|
||||||
|
|
@ -1117,6 +1117,24 @@ export const useConnectorDialog = () => {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prevent periodic indexing for Google Drive without folders/files selected
|
||||||
|
if (periodicEnabled && editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR") {
|
||||||
|
const selectedFolders = (connectorConfig || editingConnector.config)?.selected_folders as
|
||||||
|
| Array<{ id: string; name: string }>
|
||||||
|
| undefined;
|
||||||
|
const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as
|
||||||
|
| Array<{ id: string; name: string }>
|
||||||
|
| undefined;
|
||||||
|
const hasItemsSelected =
|
||||||
|
(selectedFolders && selectedFolders.length > 0) ||
|
||||||
|
(selectedFiles && selectedFiles.length > 0);
|
||||||
|
|
||||||
|
if (!hasItemsSelected) {
|
||||||
|
toast.error("Select at least one folder or file to enable periodic sync");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Validate frequency minutes if periodic is enabled (only for indexable connectors)
|
// Validate frequency minutes if periodic is enabled (only for indexable connectors)
|
||||||
if (periodicEnabled && editingConnector.is_indexable) {
|
if (periodicEnabled && editingConnector.is_indexable) {
|
||||||
const frequencyValidation = frequencyMinutesSchema.safeParse(frequencyMinutes);
|
const frequencyValidation = frequencyMinutesSchema.safeParse(frequencyMinutes);
|
||||||
|
|
@ -1132,23 +1150,14 @@ export const useConnectorDialog = () => {
|
||||||
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
||||||
|
|
||||||
// Update connector with periodic sync settings, config changes, and name
|
// Update connector with periodic sync settings, config changes, and name
|
||||||
// Note: Periodic sync is disabled for Google Drive connectors and non-indexable connectors
|
|
||||||
const frequency =
|
const frequency =
|
||||||
periodicEnabled && editingConnector.is_indexable ? parseInt(frequencyMinutes, 10) : null;
|
periodicEnabled && editingConnector.is_indexable ? parseInt(frequencyMinutes, 10) : null;
|
||||||
await updateConnector({
|
await updateConnector({
|
||||||
id: editingConnector.id,
|
id: editingConnector.id,
|
||||||
data: {
|
data: {
|
||||||
name: connectorName || editingConnector.name,
|
name: connectorName || editingConnector.name,
|
||||||
periodic_indexing_enabled:
|
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
||||||
editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
|
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
||||||
!editingConnector.is_indexable
|
|
||||||
? false
|
|
||||||
: periodicEnabled,
|
|
||||||
indexing_frequency_minutes:
|
|
||||||
editingConnector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
|
|
||||||
!editingConnector.is_indexable
|
|
||||||
? null
|
|
||||||
: frequency,
|
|
||||||
config: connectorConfig || editingConnector.config,
|
config: connectorConfig || editingConnector.config,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
@ -1166,6 +1175,13 @@ export const useConnectorDialog = () => {
|
||||||
const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as
|
const selectedFiles = (connectorConfig || editingConnector.config)?.selected_files as
|
||||||
| Array<{ id: string; name: string }>
|
| Array<{ id: string; name: string }>
|
||||||
| undefined;
|
| undefined;
|
||||||
|
const indexingOptions = (connectorConfig || editingConnector.config)?.indexing_options as
|
||||||
|
| {
|
||||||
|
max_files_per_folder: number;
|
||||||
|
incremental_sync: boolean;
|
||||||
|
include_subfolders: boolean;
|
||||||
|
}
|
||||||
|
| undefined;
|
||||||
if (
|
if (
|
||||||
(selectedFolders && selectedFolders.length > 0) ||
|
(selectedFolders && selectedFolders.length > 0) ||
|
||||||
(selectedFiles && selectedFiles.length > 0)
|
(selectedFiles && selectedFiles.length > 0)
|
||||||
|
|
@ -1178,6 +1194,11 @@ export const useConnectorDialog = () => {
|
||||||
body: {
|
body: {
|
||||||
folders: selectedFolders || [],
|
folders: selectedFolders || [],
|
||||||
files: selectedFiles || [],
|
files: selectedFiles || [],
|
||||||
|
indexing_options: indexingOptions || {
|
||||||
|
max_files_per_folder: 100,
|
||||||
|
incremental_sync: true,
|
||||||
|
include_subfolders: true,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
const totalItems = (selectedFolders?.length || 0) + (selectedFiles?.length || 0);
|
const totalItems = (selectedFolders?.length || 0) + (selectedFiles?.length || 0);
|
||||||
|
|
@ -1221,12 +1242,8 @@ export const useConnectorDialog = () => {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Track periodic indexing if enabled (for non-Google Drive connectors)
|
// Track periodic indexing if enabled
|
||||||
if (
|
if (periodicEnabled && editingConnector.is_indexable) {
|
||||||
periodicEnabled &&
|
|
||||||
editingConnector.is_indexable &&
|
|
||||||
editingConnector.connector_type !== "GOOGLE_DRIVE_CONNECTOR"
|
|
||||||
) {
|
|
||||||
trackPeriodicIndexingStarted(
|
trackPeriodicIndexingStarted(
|
||||||
Number(searchSpaceId),
|
Number(searchSpaceId),
|
||||||
editingConnector.connector_type,
|
editingConnector.connector_type,
|
||||||
|
|
|
||||||
|
|
@ -5,13 +5,13 @@ import {
|
||||||
ChevronRight,
|
ChevronRight,
|
||||||
File,
|
File,
|
||||||
FileText,
|
FileText,
|
||||||
Folder,
|
FolderClosed,
|
||||||
FolderOpen,
|
FolderOpen,
|
||||||
HardDrive,
|
HardDrive,
|
||||||
Image,
|
Image,
|
||||||
Loader2,
|
Loader2,
|
||||||
Presentation,
|
Presentation,
|
||||||
Sheet,
|
FileSpreadsheet,
|
||||||
} from "lucide-react";
|
} from "lucide-react";
|
||||||
import { useState } from "react";
|
import { useState } from "react";
|
||||||
import { Checkbox } from "@/components/ui/checkbox";
|
import { Checkbox } from "@/components/ui/checkbox";
|
||||||
|
|
@ -53,16 +53,16 @@ interface GoogleDriveFolderTreeProps {
|
||||||
// Helper to get appropriate icon for file type
|
// Helper to get appropriate icon for file type
|
||||||
function getFileIcon(mimeType: string, className: string = "h-4 w-4") {
|
function getFileIcon(mimeType: string, className: string = "h-4 w-4") {
|
||||||
if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) {
|
if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) {
|
||||||
return <Sheet className={`${className} text-green-600`} />;
|
return <FileSpreadsheet className={`${className} text-green-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) {
|
if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) {
|
||||||
return <Presentation className={`${className} text-orange-600`} />;
|
return <Presentation className={`${className} text-orange-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) {
|
if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) {
|
||||||
return <FileText className={`${className} text-blue-600`} />;
|
return <FileText className={`${className} text-gray-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("image")) {
|
if (mimeType.includes("image")) {
|
||||||
return <Image className={`${className} text-purple-600`} />;
|
return <Image className={`${className} text-purple-500`} />;
|
||||||
}
|
}
|
||||||
return <File className={`${className} text-gray-500`} />;
|
return <File className={`${className} text-gray-500`} />;
|
||||||
}
|
}
|
||||||
|
|
@ -280,9 +280,9 @@ export function GoogleDriveFolderTree({
|
||||||
<div className="shrink-0">
|
<div className="shrink-0">
|
||||||
{isFolder ? (
|
{isFolder ? (
|
||||||
isExpanded ? (
|
isExpanded ? (
|
||||||
<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-blue-500" />
|
<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
||||||
) : (
|
) : (
|
||||||
<Folder className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
<FolderClosed className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
||||||
)
|
)
|
||||||
) : (
|
) : (
|
||||||
getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")
|
getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue