mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
chore: ran backend and frontend linting
This commit is contained in:
parent
f538d59ca3
commit
e0be1b9133
8 changed files with 227 additions and 126 deletions
|
|
@ -90,11 +90,11 @@ __all__ = [
|
|||
"DocumentsCreate",
|
||||
# Google Drive schemas
|
||||
"DriveItem",
|
||||
"GoogleDriveIndexingOptions",
|
||||
"ExtensionDocumentContent",
|
||||
"ExtensionDocumentMetadata",
|
||||
"GlobalNewLLMConfigRead",
|
||||
"GoogleDriveIndexRequest",
|
||||
"GoogleDriveIndexingOptions",
|
||||
# Base schemas
|
||||
"IDModel",
|
||||
# RBAC schemas
|
||||
|
|
|
|||
|
|
@ -110,15 +110,21 @@ async def _check_and_trigger_schedules():
|
|||
)
|
||||
|
||||
# Special handling for Google Drive - uses config for folder/file selection
|
||||
if connector.connector_type == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR:
|
||||
if (
|
||||
connector.connector_type
|
||||
== SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
|
||||
):
|
||||
connector_config = connector.config or {}
|
||||
selected_folders = connector_config.get("selected_folders", [])
|
||||
selected_files = connector_config.get("selected_files", [])
|
||||
indexing_options = connector_config.get("indexing_options", {
|
||||
"max_files_per_folder": 100,
|
||||
"incremental_sync": True,
|
||||
"include_subfolders": True,
|
||||
})
|
||||
indexing_options = connector_config.get(
|
||||
"indexing_options",
|
||||
{
|
||||
"max_files_per_folder": 100,
|
||||
"incremental_sync": True,
|
||||
"include_subfolders": True,
|
||||
},
|
||||
)
|
||||
|
||||
if selected_folders or selected_files:
|
||||
task.delay(
|
||||
|
|
@ -139,6 +145,7 @@ async def _check_and_trigger_schedules():
|
|||
"skipping periodic indexing (will check again at next scheduled time)"
|
||||
)
|
||||
from datetime import timedelta
|
||||
|
||||
connector.next_scheduled_at = now + timedelta(
|
||||
minutes=connector.indexing_frequency_minutes
|
||||
)
|
||||
|
|
|
|||
|
|
@ -388,7 +388,11 @@ async def _index_full_scan(
|
|||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
|
||||
{"stage": "full_scan", "folder_id": folder_id, "include_subfolders": include_subfolders},
|
||||
{
|
||||
"stage": "full_scan",
|
||||
"folder_id": folder_id,
|
||||
"include_subfolders": include_subfolders,
|
||||
},
|
||||
)
|
||||
|
||||
documents_indexed = 0
|
||||
|
|
@ -407,7 +411,10 @@ async def _index_full_scan(
|
|||
# Get files and folders in current folder
|
||||
# include_subfolders=True here so we get folder items to queue them
|
||||
files, next_token, error = await get_files_in_folder(
|
||||
drive_client, current_folder_id, include_subfolders=True, page_token=page_token
|
||||
drive_client,
|
||||
current_folder_id,
|
||||
include_subfolders=True,
|
||||
page_token=page_token,
|
||||
)
|
||||
|
||||
if error:
|
||||
|
|
@ -426,7 +433,9 @@ async def _index_full_scan(
|
|||
# If this is a folder and include_subfolders is enabled, queue it for processing
|
||||
if mime_type == "application/vnd.google-apps.folder":
|
||||
if include_subfolders:
|
||||
folders_to_process.append((file["id"], file.get("name", "Unknown")))
|
||||
folders_to_process.append(
|
||||
(file["id"], file.get("name", "Unknown"))
|
||||
)
|
||||
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
|
||||
continue
|
||||
|
||||
|
|
@ -478,7 +487,7 @@ async def _index_with_delta_sync(
|
|||
include_subfolders: bool = False,
|
||||
) -> tuple[int, int]:
|
||||
"""Perform delta sync indexing using change tracking.
|
||||
|
||||
|
||||
Note: include_subfolders is accepted for API consistency but delta sync
|
||||
automatically tracks changes across all folders including subfolders.
|
||||
"""
|
||||
|
|
@ -554,16 +563,16 @@ async def _check_rename_only_update(
|
|||
) -> tuple[bool, str | None]:
|
||||
"""
|
||||
Check if a file only needs a rename update (no content change).
|
||||
|
||||
|
||||
Uses md5Checksum comparison (preferred) or modifiedTime (fallback for Google Workspace files)
|
||||
to detect if content has changed. This optimization prevents unnecessary ETL API calls
|
||||
(Docling/LlamaCloud) for rename-only operations.
|
||||
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
file: File metadata from Google Drive API
|
||||
search_space_id: ID of the search space
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (is_rename_only, message)
|
||||
- (True, message): Only filename changed, document was updated
|
||||
|
|
@ -571,69 +580,76 @@ async def _check_rename_only_update(
|
|||
"""
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
from app.db import Document
|
||||
|
||||
|
||||
file_id = file.get("id")
|
||||
file_name = file.get("name", "Unknown")
|
||||
incoming_md5 = file.get("md5Checksum") # None for Google Workspace files
|
||||
incoming_modified_time = file.get("modifiedTime")
|
||||
|
||||
|
||||
if not file_id:
|
||||
return False, None
|
||||
|
||||
|
||||
# Try to find existing document by file_id-based hash (primary method)
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||
)
|
||||
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||
|
||||
|
||||
# If not found by primary hash, try searching by metadata (for legacy documents)
|
||||
if not existing_document:
|
||||
result = await session.execute(
|
||||
select(Document).where(
|
||||
Document.search_space_id == search_space_id,
|
||||
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||
)
|
||||
)
|
||||
existing_document = result.scalar_one_or_none()
|
||||
if existing_document:
|
||||
logger.debug(f"Found legacy document by metadata for file_id: {file_id}")
|
||||
|
||||
|
||||
if not existing_document:
|
||||
# New file, needs full processing
|
||||
return False, None
|
||||
|
||||
|
||||
# Get stored checksums/timestamps from document metadata
|
||||
doc_metadata = existing_document.document_metadata or {}
|
||||
stored_md5 = doc_metadata.get("md5_checksum")
|
||||
stored_modified_time = doc_metadata.get("modified_time")
|
||||
|
||||
|
||||
# Determine if content changed using md5Checksum (preferred) or modifiedTime (fallback)
|
||||
content_unchanged = False
|
||||
|
||||
|
||||
if incoming_md5 and stored_md5:
|
||||
# Best case: Compare md5 checksums (only changes when content changes, not on rename)
|
||||
content_unchanged = (incoming_md5 == stored_md5)
|
||||
content_unchanged = incoming_md5 == stored_md5
|
||||
logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}")
|
||||
elif incoming_md5 and not stored_md5:
|
||||
# Have incoming md5 but no stored md5 (legacy doc) - need to reprocess to store it
|
||||
logger.debug(f"No stored md5 for {file_name}, will reprocess to store md5_checksum")
|
||||
logger.debug(
|
||||
f"No stored md5 for {file_name}, will reprocess to store md5_checksum"
|
||||
)
|
||||
return False, None
|
||||
elif not incoming_md5:
|
||||
# Google Workspace file (no md5Checksum available) - fall back to modifiedTime
|
||||
# Note: modifiedTime is less reliable as it changes on rename too, but it's the best we have
|
||||
if incoming_modified_time and stored_modified_time:
|
||||
content_unchanged = (incoming_modified_time == stored_modified_time)
|
||||
logger.debug(f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}")
|
||||
content_unchanged = incoming_modified_time == stored_modified_time
|
||||
logger.debug(
|
||||
f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}"
|
||||
)
|
||||
else:
|
||||
# No stored modifiedTime (legacy) - reprocess to store it
|
||||
return False, None
|
||||
|
||||
|
||||
if content_unchanged:
|
||||
# Content hasn't changed - check if filename changed
|
||||
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name")
|
||||
|
||||
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
|
||||
"google_drive_file_name"
|
||||
)
|
||||
|
||||
if old_name and old_name != file_name:
|
||||
# Rename-only update - update the document without re-processing
|
||||
existing_document.title = file_name
|
||||
|
|
@ -643,17 +659,24 @@ async def _check_rename_only_update(
|
|||
existing_document.document_metadata["google_drive_file_name"] = file_name
|
||||
# Also update modified_time for Google Workspace files (since it changed on rename)
|
||||
if incoming_modified_time:
|
||||
existing_document.document_metadata["modified_time"] = incoming_modified_time
|
||||
existing_document.document_metadata["modified_time"] = (
|
||||
incoming_modified_time
|
||||
)
|
||||
flag_modified(existing_document, "document_metadata")
|
||||
await session.commit()
|
||||
|
||||
logger.info(f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)")
|
||||
return True, f"File renamed: '{old_name}' → '{file_name}' (no content change)"
|
||||
|
||||
logger.info(
|
||||
f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)"
|
||||
)
|
||||
return (
|
||||
True,
|
||||
f"File renamed: '{old_name}' → '{file_name}' (no content change)",
|
||||
)
|
||||
else:
|
||||
# Neither content nor name changed
|
||||
logger.debug(f"File unchanged: {file_name}")
|
||||
return True, "File unchanged (same content and name)"
|
||||
|
||||
|
||||
# Content changed - needs full processing
|
||||
return False, None
|
||||
|
||||
|
|
@ -688,7 +711,7 @@ async def _process_single_file(
|
|||
file=file,
|
||||
search_space_id=search_space_id,
|
||||
)
|
||||
|
||||
|
||||
if is_rename_only:
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
|
|
@ -729,12 +752,13 @@ async def _process_single_file(
|
|||
|
||||
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
||||
"""Remove a document that was deleted in Drive.
|
||||
|
||||
|
||||
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
|
||||
"""
|
||||
from sqlalchemy import select
|
||||
|
||||
from app.db import Document
|
||||
|
||||
|
||||
# First try with file_id-based hash (new method)
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||
|
|
@ -750,7 +774,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
|
|||
select(Document).where(
|
||||
Document.search_space_id == search_space_id,
|
||||
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id
|
||||
Document.document_metadata["google_drive_file_id"].astext == file_id,
|
||||
)
|
||||
)
|
||||
existing_document = result.scalar_one_or_none()
|
||||
|
|
|
|||
|
|
@ -57,15 +57,15 @@ def get_google_drive_unique_identifier(
|
|||
) -> tuple[str, str | None]:
|
||||
"""
|
||||
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||
|
||||
|
||||
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||
For other files, uses filename.
|
||||
|
||||
|
||||
Args:
|
||||
connector: Optional connector info dict with type and metadata
|
||||
filename: The filename (used for non-Google Drive files or as fallback)
|
||||
search_space_id: The search space ID
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (primary_hash, legacy_hash or None)
|
||||
- For Google Drive: (file_id_based_hash, filename_based_hash for migration)
|
||||
|
|
@ -74,7 +74,7 @@ def get_google_drive_unique_identifier(
|
|||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
metadata = connector.get("metadata", {})
|
||||
file_id = metadata.get("google_drive_file_id")
|
||||
|
||||
|
||||
if file_id:
|
||||
# New method: use file_id as unique identifier (doesn't change on rename)
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
|
|
@ -86,7 +86,7 @@ def get_google_drive_unique_identifier(
|
|||
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||
)
|
||||
return primary_hash, legacy_hash
|
||||
|
||||
|
||||
# For non-Google Drive files, use filename as before
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.FILE, filename, search_space_id
|
||||
|
|
@ -104,7 +104,7 @@ async def handle_existing_document_update(
|
|||
) -> tuple[bool, Document | None]:
|
||||
"""
|
||||
Handle update logic for an existing document.
|
||||
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
existing_document: The existing document found in database
|
||||
|
|
@ -112,7 +112,7 @@ async def handle_existing_document_update(
|
|||
connector: Optional connector info
|
||||
filename: Current filename
|
||||
primary_hash: The primary hash (file_id based for Google Drive)
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (should_skip_processing, document_to_return)
|
||||
- (True, document): Content unchanged, just return existing document
|
||||
|
|
@ -122,7 +122,7 @@ async def handle_existing_document_update(
|
|||
if existing_document.unique_identifier_hash != primary_hash:
|
||||
existing_document.unique_identifier_hash = primary_hash
|
||||
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||
|
||||
|
||||
# Check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||
|
|
@ -131,12 +131,14 @@ async def handle_existing_document_update(
|
|||
new_name = connector_metadata.get("google_drive_file_name")
|
||||
# Check both possible keys for old name (FILE_NAME is used in stored documents)
|
||||
doc_metadata = existing_document.document_metadata or {}
|
||||
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name")
|
||||
|
||||
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
|
||||
"google_drive_file_name"
|
||||
)
|
||||
|
||||
if new_name and old_name and old_name != new_name:
|
||||
# File was renamed - update title and metadata, skip expensive processing
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
|
||||
existing_document.title = new_name
|
||||
if not existing_document.document_metadata:
|
||||
existing_document.document_metadata = {}
|
||||
|
|
@ -144,8 +146,10 @@ async def handle_existing_document_update(
|
|||
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||
flag_modified(existing_document, "document_metadata")
|
||||
await session.commit()
|
||||
logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
|
||||
|
||||
logging.info(
|
||||
f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
|
||||
)
|
||||
|
||||
logging.info(f"Document for file {filename} unchanged. Skipping.")
|
||||
return True, existing_document
|
||||
else:
|
||||
|
|
@ -163,25 +167,29 @@ async def find_existing_document_with_migration(
|
|||
"""
|
||||
Find existing document, checking both new hash and legacy hash for migration,
|
||||
with fallback to content_hash for cross-source deduplication.
|
||||
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
primary_hash: The primary hash (file_id based for Google Drive)
|
||||
legacy_hash: The legacy hash (filename based) for migration, or None
|
||||
content_hash: The content hash for fallback deduplication, or None
|
||||
|
||||
|
||||
Returns:
|
||||
Existing document if found, None otherwise
|
||||
"""
|
||||
# First check with primary hash (new method)
|
||||
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||
|
||||
|
||||
# If not found and we have a legacy hash, check with that (migration path)
|
||||
if not existing_document and legacy_hash:
|
||||
existing_document = await check_document_by_unique_identifier(session, legacy_hash)
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, legacy_hash
|
||||
)
|
||||
if existing_document:
|
||||
logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
|
||||
|
||||
logging.info(
|
||||
"Found legacy document (filename-based hash), will migrate to file_id-based hash"
|
||||
)
|
||||
|
||||
# Fallback: check by content_hash to catch duplicates from different sources
|
||||
# This prevents unique constraint violations when the same content exists
|
||||
# under a different unique_identifier (e.g., manual upload vs Google Drive)
|
||||
|
|
@ -192,7 +200,7 @@ async def find_existing_document_with_migration(
|
|||
f"Found duplicate content from different source (content_hash match). "
|
||||
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
|
||||
)
|
||||
|
||||
|
||||
return existing_document
|
||||
|
||||
|
||||
|
|
@ -342,7 +350,12 @@ async def add_received_file_document_using_unstructured(
|
|||
if existing_document:
|
||||
# Handle existing document (rename detection, content change check)
|
||||
should_skip, doc = await handle_existing_document_update(
|
||||
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||
session,
|
||||
existing_document,
|
||||
content_hash,
|
||||
connector,
|
||||
file_name,
|
||||
primary_hash,
|
||||
)
|
||||
if should_skip:
|
||||
return doc
|
||||
|
|
@ -402,7 +415,7 @@ async def add_received_file_document_using_unstructured(
|
|||
doc_type = DocumentType.FILE
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||
|
||||
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
|
|
@ -476,7 +489,12 @@ async def add_received_file_document_using_llamacloud(
|
|||
if existing_document:
|
||||
# Handle existing document (rename detection, content change check)
|
||||
should_skip, doc = await handle_existing_document_update(
|
||||
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||
session,
|
||||
existing_document,
|
||||
content_hash,
|
||||
connector,
|
||||
file_name,
|
||||
primary_hash,
|
||||
)
|
||||
if should_skip:
|
||||
return doc
|
||||
|
|
@ -536,7 +554,7 @@ async def add_received_file_document_using_llamacloud(
|
|||
doc_type = DocumentType.FILE
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||
|
||||
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
|
|
@ -611,7 +629,12 @@ async def add_received_file_document_using_docling(
|
|||
if existing_document:
|
||||
# Handle existing document (rename detection, content change check)
|
||||
should_skip, doc = await handle_existing_document_update(
|
||||
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||
session,
|
||||
existing_document,
|
||||
content_hash,
|
||||
connector,
|
||||
file_name,
|
||||
primary_hash,
|
||||
)
|
||||
if should_skip:
|
||||
return doc
|
||||
|
|
@ -695,7 +718,7 @@ async def add_received_file_document_using_docling(
|
|||
doc_type = DocumentType.FILE
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||
|
||||
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
|
|
|
|||
|
|
@ -31,22 +31,22 @@ def _get_google_drive_unique_identifier(
|
|||
) -> tuple[str, str | None]:
|
||||
"""
|
||||
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||
|
||||
|
||||
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||
For other files, uses filename.
|
||||
|
||||
|
||||
Args:
|
||||
connector: Optional connector info dict with type and metadata
|
||||
filename: The filename (used for non-Google Drive files or as fallback)
|
||||
search_space_id: The search space ID
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (primary_hash, legacy_hash or None)
|
||||
"""
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
metadata = connector.get("metadata", {})
|
||||
file_id = metadata.get("google_drive_file_id")
|
||||
|
||||
|
||||
if file_id:
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||
|
|
@ -55,7 +55,7 @@ def _get_google_drive_unique_identifier(
|
|||
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||
)
|
||||
return primary_hash, legacy_hash
|
||||
|
||||
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.FILE, filename, search_space_id
|
||||
)
|
||||
|
|
@ -73,12 +73,16 @@ async def _find_existing_document_with_migration(
|
|||
with fallback to content_hash for cross-source deduplication.
|
||||
"""
|
||||
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||
|
||||
|
||||
if not existing_document and legacy_hash:
|
||||
existing_document = await check_document_by_unique_identifier(session, legacy_hash)
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, legacy_hash
|
||||
)
|
||||
if existing_document:
|
||||
logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
|
||||
|
||||
logging.info(
|
||||
"Found legacy document (filename-based hash), will migrate to file_id-based hash"
|
||||
)
|
||||
|
||||
# Fallback: check by content_hash to catch duplicates from different sources
|
||||
if not existing_document and content_hash:
|
||||
existing_document = await check_duplicate_document(session, content_hash)
|
||||
|
|
@ -87,7 +91,7 @@ async def _find_existing_document_with_migration(
|
|||
f"Found duplicate content from different source (content_hash match). "
|
||||
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
|
||||
)
|
||||
|
||||
|
||||
return existing_document
|
||||
|
||||
|
||||
|
|
@ -103,7 +107,7 @@ async def _handle_existing_document_update(
|
|||
) -> tuple[bool, Document | None]:
|
||||
"""
|
||||
Handle update logic for an existing document.
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (should_skip_processing, document_to_return)
|
||||
"""
|
||||
|
|
@ -111,7 +115,7 @@ async def _handle_existing_document_update(
|
|||
if existing_document.unique_identifier_hash != primary_hash:
|
||||
existing_document.unique_identifier_hash = primary_hash
|
||||
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||
|
||||
|
||||
# Check if content has changed
|
||||
if existing_document.content_hash == content_hash:
|
||||
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||
|
|
@ -120,12 +124,16 @@ async def _handle_existing_document_update(
|
|||
new_name = connector_metadata.get("google_drive_file_name")
|
||||
# Check both possible keys for old name (FILE_NAME is used in stored documents)
|
||||
doc_metadata = existing_document.document_metadata or {}
|
||||
old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name") or doc_metadata.get("file_name")
|
||||
|
||||
old_name = (
|
||||
doc_metadata.get("FILE_NAME")
|
||||
or doc_metadata.get("google_drive_file_name")
|
||||
or doc_metadata.get("file_name")
|
||||
)
|
||||
|
||||
if new_name and old_name and old_name != new_name:
|
||||
# File was renamed - update title and metadata, skip expensive processing
|
||||
from sqlalchemy.orm.attributes import flag_modified
|
||||
|
||||
|
||||
existing_document.title = new_name
|
||||
if not existing_document.document_metadata:
|
||||
existing_document.document_metadata = {}
|
||||
|
|
@ -134,8 +142,10 @@ async def _handle_existing_document_update(
|
|||
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||
flag_modified(existing_document, "document_metadata")
|
||||
await session.commit()
|
||||
logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
|
||||
|
||||
logging.info(
|
||||
f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
|
||||
)
|
||||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Markdown file document unchanged: {filename}",
|
||||
|
|
@ -147,7 +157,9 @@ async def _handle_existing_document_update(
|
|||
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
|
||||
return True, existing_document
|
||||
else:
|
||||
logging.info(f"Content changed for markdown file {filename}. Updating document.")
|
||||
logging.info(
|
||||
f"Content changed for markdown file {filename}. Updating document."
|
||||
)
|
||||
return False, None
|
||||
|
||||
|
||||
|
|
@ -204,8 +216,14 @@ async def add_received_markdown_file_document(
|
|||
if existing_document:
|
||||
# Handle existing document (rename detection, content change check)
|
||||
should_skip, doc = await _handle_existing_document_update(
|
||||
session, existing_document, content_hash, connector, file_name, primary_hash,
|
||||
task_logger, log_entry
|
||||
session,
|
||||
existing_document,
|
||||
content_hash,
|
||||
connector,
|
||||
file_name,
|
||||
primary_hash,
|
||||
task_logger,
|
||||
log_entry,
|
||||
)
|
||||
if should_skip:
|
||||
return doc
|
||||
|
|
@ -262,7 +280,7 @@ async def add_received_markdown_file_document(
|
|||
doc_type = DocumentType.FILE
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||
|
||||
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
|
|
|
|||
|
|
@ -38,11 +38,7 @@ export const PeriodicSyncConfig: FC<PeriodicSyncConfigProps> = ({
|
|||
Automatically re-index at regular intervals
|
||||
</p>
|
||||
</div>
|
||||
<Switch
|
||||
checked={enabled}
|
||||
onCheckedChange={onEnabledChange}
|
||||
disabled={disabled}
|
||||
/>
|
||||
<Switch checked={enabled} onCheckedChange={onEnabledChange} disabled={disabled} />
|
||||
</div>
|
||||
|
||||
{/* Show disabled message when periodic sync can't be enabled */}
|
||||
|
|
|
|||
|
|
@ -37,19 +37,42 @@ const DEFAULT_INDEXING_OPTIONS: IndexingOptions = {
|
|||
function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") {
|
||||
const lowerName = fileName.toLowerCase();
|
||||
// Spreadsheets
|
||||
if (lowerName.endsWith(".xlsx") || lowerName.endsWith(".xls") || lowerName.endsWith(".csv") || lowerName.includes("spreadsheet")) {
|
||||
if (
|
||||
lowerName.endsWith(".xlsx") ||
|
||||
lowerName.endsWith(".xls") ||
|
||||
lowerName.endsWith(".csv") ||
|
||||
lowerName.includes("spreadsheet")
|
||||
) {
|
||||
return <FileSpreadsheet className={`${className} text-green-500`} />;
|
||||
}
|
||||
// Presentations
|
||||
if (lowerName.endsWith(".pptx") || lowerName.endsWith(".ppt") || lowerName.includes("presentation")) {
|
||||
if (
|
||||
lowerName.endsWith(".pptx") ||
|
||||
lowerName.endsWith(".ppt") ||
|
||||
lowerName.includes("presentation")
|
||||
) {
|
||||
return <Presentation className={`${className} text-orange-500`} />;
|
||||
}
|
||||
// Documents (word, text only - not PDF)
|
||||
if (lowerName.endsWith(".docx") || lowerName.endsWith(".doc") || lowerName.endsWith(".txt") || lowerName.includes("document") || lowerName.includes("word") || lowerName.includes("text")) {
|
||||
if (
|
||||
lowerName.endsWith(".docx") ||
|
||||
lowerName.endsWith(".doc") ||
|
||||
lowerName.endsWith(".txt") ||
|
||||
lowerName.includes("document") ||
|
||||
lowerName.includes("word") ||
|
||||
lowerName.includes("text")
|
||||
) {
|
||||
return <FileText className={`${className} text-gray-500`} />;
|
||||
}
|
||||
// Images
|
||||
if (lowerName.endsWith(".png") || lowerName.endsWith(".jpg") || lowerName.endsWith(".jpeg") || lowerName.endsWith(".gif") || lowerName.endsWith(".webp") || lowerName.endsWith(".svg")) {
|
||||
if (
|
||||
lowerName.endsWith(".png") ||
|
||||
lowerName.endsWith(".jpg") ||
|
||||
lowerName.endsWith(".jpeg") ||
|
||||
lowerName.endsWith(".gif") ||
|
||||
lowerName.endsWith(".webp") ||
|
||||
lowerName.endsWith(".svg")
|
||||
) {
|
||||
return <Image className={`${className} text-purple-500`} />;
|
||||
}
|
||||
// Default (including PDF)
|
||||
|
|
@ -61,7 +84,8 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
|||
const existingFolders =
|
||||
(connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
||||
const existingFiles = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
||||
const existingIndexingOptions = (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
|
||||
const existingIndexingOptions =
|
||||
(connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
|
||||
|
||||
const [selectedFolders, setSelectedFolders] = useState<SelectedFolder[]>(existingFolders);
|
||||
const [selectedFiles, setSelectedFiles] = useState<SelectedFolder[]>(existingFiles);
|
||||
|
|
@ -72,7 +96,9 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
|||
useEffect(() => {
|
||||
const folders = (connector.config?.selected_folders as SelectedFolder[] | undefined) || [];
|
||||
const files = (connector.config?.selected_files as SelectedFolder[] | undefined) || [];
|
||||
const options = (connector.config?.indexing_options as IndexingOptions | undefined) || DEFAULT_INDEXING_OPTIONS;
|
||||
const options =
|
||||
(connector.config?.indexing_options as IndexingOptions | undefined) ||
|
||||
DEFAULT_INDEXING_OPTIONS;
|
||||
setSelectedFolders(folders);
|
||||
setSelectedFiles(files);
|
||||
setIndexingOptions(options);
|
||||
|
|
@ -125,11 +151,12 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
|||
{totalSelected > 0 && (
|
||||
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
|
||||
<p className="font-medium">
|
||||
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:{" "}
|
||||
{(() => {
|
||||
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => {
|
||||
const parts: string[] = [];
|
||||
if (selectedFolders.length > 0) {
|
||||
parts.push(`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`);
|
||||
parts.push(
|
||||
`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`
|
||||
);
|
||||
}
|
||||
if (selectedFiles.length > 0) {
|
||||
parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
|
||||
|
|
@ -259,9 +286,7 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
|||
<Switch
|
||||
id="incremental-sync"
|
||||
checked={indexingOptions.incremental_sync}
|
||||
onCheckedChange={(checked) =>
|
||||
handleIndexingOptionChange("incremental_sync", checked)
|
||||
}
|
||||
onCheckedChange={(checked) => handleIndexingOptionChange("incremental_sync", checked)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
|
|
@ -278,9 +303,7 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
|||
<Switch
|
||||
id="include-subfolders"
|
||||
checked={indexingOptions.include_subfolders}
|
||||
onCheckedChange={(checked) =>
|
||||
handleIndexingOptionChange("include_subfolders", checked)
|
||||
}
|
||||
onCheckedChange={(checked) => handleIndexingOptionChange("include_subfolders", checked)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -218,26 +218,36 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
|||
/>
|
||||
)}
|
||||
|
||||
{/* Periodic sync - shown for all indexable connectors */}
|
||||
{(() => {
|
||||
// Check if Google Drive has folders/files selected
|
||||
const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
|
||||
const selectedFolders = (connector.config?.selected_folders as Array<{ id: string; name: string }> | undefined) || [];
|
||||
const selectedFiles = (connector.config?.selected_files as Array<{ id: string; name: string }> | undefined) || [];
|
||||
const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
|
||||
const isDisabled = isGoogleDrive && !hasItemsSelected;
|
||||
|
||||
return (
|
||||
<PeriodicSyncConfig
|
||||
enabled={periodicEnabled}
|
||||
frequencyMinutes={frequencyMinutes}
|
||||
onEnabledChange={onPeriodicEnabledChange}
|
||||
onFrequencyChange={onFrequencyChange}
|
||||
disabled={isDisabled}
|
||||
disabledMessage={isDisabled ? "Select at least one folder or file above to enable periodic sync" : undefined}
|
||||
/>
|
||||
);
|
||||
})()}
|
||||
{/* Periodic sync - shown for all indexable connectors */}
|
||||
{(() => {
|
||||
// Check if Google Drive has folders/files selected
|
||||
const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR";
|
||||
const selectedFolders =
|
||||
(connector.config?.selected_folders as
|
||||
| Array<{ id: string; name: string }>
|
||||
| undefined) || [];
|
||||
const selectedFiles =
|
||||
(connector.config?.selected_files as
|
||||
| Array<{ id: string; name: string }>
|
||||
| undefined) || [];
|
||||
const hasItemsSelected = selectedFolders.length > 0 || selectedFiles.length > 0;
|
||||
const isDisabled = isGoogleDrive && !hasItemsSelected;
|
||||
|
||||
return (
|
||||
<PeriodicSyncConfig
|
||||
enabled={periodicEnabled}
|
||||
frequencyMinutes={frequencyMinutes}
|
||||
onEnabledChange={onPeriodicEnabledChange}
|
||||
onFrequencyChange={onFrequencyChange}
|
||||
disabled={isDisabled}
|
||||
disabledMessage={
|
||||
isDisabled
|
||||
? "Select at least one folder or file above to enable periodic sync"
|
||||
: undefined
|
||||
}
|
||||
/>
|
||||
);
|
||||
})()}
|
||||
</>
|
||||
)}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue