diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py
index df31aed89..017c78577 100644
--- a/surfsense_backend/app/schemas/__init__.py
+++ b/surfsense_backend/app/schemas/__init__.py
@@ -90,11 +90,11 @@ __all__ = [
"DocumentsCreate",
# Google Drive schemas
"DriveItem",
- "GoogleDriveIndexingOptions",
"ExtensionDocumentContent",
"ExtensionDocumentMetadata",
"GlobalNewLLMConfigRead",
"GoogleDriveIndexRequest",
+ "GoogleDriveIndexingOptions",
# Base schemas
"IDModel",
# RBAC schemas
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
index e8c2a728c..21855f73f 100644
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -110,15 +110,21 @@ async def _check_and_trigger_schedules():
)
# Special handling for Google Drive - uses config for folder/file selection
- if connector.connector_type == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR:
+ if (
+ connector.connector_type
+ == SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR
+ ):
connector_config = connector.config or {}
selected_folders = connector_config.get("selected_folders", [])
selected_files = connector_config.get("selected_files", [])
- indexing_options = connector_config.get("indexing_options", {
- "max_files_per_folder": 100,
- "incremental_sync": True,
- "include_subfolders": True,
- })
+ indexing_options = connector_config.get(
+ "indexing_options",
+ {
+ "max_files_per_folder": 100,
+ "incremental_sync": True,
+ "include_subfolders": True,
+ },
+ )
if selected_folders or selected_files:
task.delay(
@@ -139,6 +145,7 @@ async def _check_and_trigger_schedules():
"skipping periodic indexing (will check again at next scheduled time)"
)
from datetime import timedelta
+
connector.next_scheduled_at = now + timedelta(
minutes=connector.indexing_frequency_minutes
)
diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
index f655c290d..48282a1af 100644
--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@@ -388,7 +388,11 @@ async def _index_full_scan(
await task_logger.log_task_progress(
log_entry,
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
- {"stage": "full_scan", "folder_id": folder_id, "include_subfolders": include_subfolders},
+ {
+ "stage": "full_scan",
+ "folder_id": folder_id,
+ "include_subfolders": include_subfolders,
+ },
)
documents_indexed = 0
@@ -407,7 +411,10 @@ async def _index_full_scan(
# Get files and folders in current folder
# include_subfolders=True here so we get folder items to queue them
files, next_token, error = await get_files_in_folder(
- drive_client, current_folder_id, include_subfolders=True, page_token=page_token
+ drive_client,
+ current_folder_id,
+ include_subfolders=True,
+ page_token=page_token,
)
if error:
@@ -426,7 +433,9 @@ async def _index_full_scan(
# If this is a folder and include_subfolders is enabled, queue it for processing
if mime_type == "application/vnd.google-apps.folder":
if include_subfolders:
- folders_to_process.append((file["id"], file.get("name", "Unknown")))
+ folders_to_process.append(
+ (file["id"], file.get("name", "Unknown"))
+ )
logger.debug(f"Queued subfolder: {file.get('name', 'Unknown')}")
continue
@@ -478,7 +487,7 @@ async def _index_with_delta_sync(
include_subfolders: bool = False,
) -> tuple[int, int]:
"""Perform delta sync indexing using change tracking.
-
+
Note: include_subfolders is accepted for API consistency but delta sync
automatically tracks changes across all folders including subfolders.
"""
@@ -554,16 +563,16 @@ async def _check_rename_only_update(
) -> tuple[bool, str | None]:
"""
Check if a file only needs a rename update (no content change).
-
+
Uses md5Checksum comparison (preferred) or modifiedTime (fallback for Google Workspace files)
to detect if content has changed. This optimization prevents unnecessary ETL API calls
(Docling/LlamaCloud) for rename-only operations.
-
+
Args:
session: Database session
file: File metadata from Google Drive API
search_space_id: ID of the search space
-
+
Returns:
Tuple of (is_rename_only, message)
- (True, message): Only filename changed, document was updated
@@ -571,69 +580,76 @@ async def _check_rename_only_update(
"""
from sqlalchemy import select
from sqlalchemy.orm.attributes import flag_modified
+
from app.db import Document
-
+
file_id = file.get("id")
file_name = file.get("name", "Unknown")
incoming_md5 = file.get("md5Checksum") # None for Google Workspace files
incoming_modified_time = file.get("modifiedTime")
-
+
if not file_id:
return False, None
-
+
# Try to find existing document by file_id-based hash (primary method)
primary_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
)
existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
+
# If not found by primary hash, try searching by metadata (for legacy documents)
if not existing_document:
result = await session.execute(
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- Document.document_metadata["google_drive_file_id"].astext == file_id
+ Document.document_metadata["google_drive_file_id"].astext == file_id,
)
)
existing_document = result.scalar_one_or_none()
if existing_document:
logger.debug(f"Found legacy document by metadata for file_id: {file_id}")
-
+
if not existing_document:
# New file, needs full processing
return False, None
-
+
# Get stored checksums/timestamps from document metadata
doc_metadata = existing_document.document_metadata or {}
stored_md5 = doc_metadata.get("md5_checksum")
stored_modified_time = doc_metadata.get("modified_time")
-
+
# Determine if content changed using md5Checksum (preferred) or modifiedTime (fallback)
content_unchanged = False
-
+
if incoming_md5 and stored_md5:
# Best case: Compare md5 checksums (only changes when content changes, not on rename)
- content_unchanged = (incoming_md5 == stored_md5)
+ content_unchanged = incoming_md5 == stored_md5
logger.debug(f"MD5 comparison for {file_name}: unchanged={content_unchanged}")
elif incoming_md5 and not stored_md5:
# Have incoming md5 but no stored md5 (legacy doc) - need to reprocess to store it
- logger.debug(f"No stored md5 for {file_name}, will reprocess to store md5_checksum")
+ logger.debug(
+ f"No stored md5 for {file_name}, will reprocess to store md5_checksum"
+ )
return False, None
elif not incoming_md5:
# Google Workspace file (no md5Checksum available) - fall back to modifiedTime
# Note: modifiedTime is less reliable as it changes on rename too, but it's the best we have
if incoming_modified_time and stored_modified_time:
- content_unchanged = (incoming_modified_time == stored_modified_time)
- logger.debug(f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}")
+ content_unchanged = incoming_modified_time == stored_modified_time
+ logger.debug(
+ f"ModifiedTime fallback for Google Workspace file {file_name}: unchanged={content_unchanged}"
+ )
else:
# No stored modifiedTime (legacy) - reprocess to store it
return False, None
-
+
if content_unchanged:
# Content hasn't changed - check if filename changed
- old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name")
-
+ old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
+ "google_drive_file_name"
+ )
+
if old_name and old_name != file_name:
# Rename-only update - update the document without re-processing
existing_document.title = file_name
@@ -643,17 +659,24 @@ async def _check_rename_only_update(
existing_document.document_metadata["google_drive_file_name"] = file_name
# Also update modified_time for Google Workspace files (since it changed on rename)
if incoming_modified_time:
- existing_document.document_metadata["modified_time"] = incoming_modified_time
+ existing_document.document_metadata["modified_time"] = (
+ incoming_modified_time
+ )
flag_modified(existing_document, "document_metadata")
await session.commit()
-
- logger.info(f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)")
- return True, f"File renamed: '{old_name}' → '{file_name}' (no content change)"
+
+ logger.info(
+ f"Rename-only update: '{old_name}' → '{file_name}' (skipped ETL)"
+ )
+ return (
+ True,
+ f"File renamed: '{old_name}' → '{file_name}' (no content change)",
+ )
else:
# Neither content nor name changed
logger.debug(f"File unchanged: {file_name}")
return True, "File unchanged (same content and name)"
-
+
# Content changed - needs full processing
return False, None
@@ -688,7 +711,7 @@ async def _process_single_file(
file=file,
search_space_id=search_space_id,
)
-
+
if is_rename_only:
await task_logger.log_task_progress(
log_entry,
@@ -729,12 +752,13 @@ async def _process_single_file(
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
"""Remove a document that was deleted in Drive.
-
+
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
"""
from sqlalchemy import select
+
from app.db import Document
-
+
# First try with file_id-based hash (new method)
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
@@ -750,7 +774,7 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
select(Document).where(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
- Document.document_metadata["google_drive_file_id"].astext == file_id
+ Document.document_metadata["google_drive_file_id"].astext == file_id,
)
)
existing_document = result.scalar_one_or_none()
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 96f0af756..0a22c20c2 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -57,15 +57,15 @@ def get_google_drive_unique_identifier(
) -> tuple[str, str | None]:
"""
Get unique identifier hash for a file, with special handling for Google Drive.
-
+
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
For other files, uses filename.
-
+
Args:
connector: Optional connector info dict with type and metadata
filename: The filename (used for non-Google Drive files or as fallback)
search_space_id: The search space ID
-
+
Returns:
Tuple of (primary_hash, legacy_hash or None)
- For Google Drive: (file_id_based_hash, filename_based_hash for migration)
@@ -74,7 +74,7 @@ def get_google_drive_unique_identifier(
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
metadata = connector.get("metadata", {})
file_id = metadata.get("google_drive_file_id")
-
+
if file_id:
# New method: use file_id as unique identifier (doesn't change on rename)
primary_hash = generate_unique_identifier_hash(
@@ -86,7 +86,7 @@ def get_google_drive_unique_identifier(
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
)
return primary_hash, legacy_hash
-
+
# For non-Google Drive files, use filename as before
primary_hash = generate_unique_identifier_hash(
DocumentType.FILE, filename, search_space_id
@@ -104,7 +104,7 @@ async def handle_existing_document_update(
) -> tuple[bool, Document | None]:
"""
Handle update logic for an existing document.
-
+
Args:
session: Database session
existing_document: The existing document found in database
@@ -112,7 +112,7 @@ async def handle_existing_document_update(
connector: Optional connector info
filename: Current filename
primary_hash: The primary hash (file_id based for Google Drive)
-
+
Returns:
Tuple of (should_skip_processing, document_to_return)
- (True, document): Content unchanged, just return existing document
@@ -122,7 +122,7 @@ async def handle_existing_document_update(
if existing_document.unique_identifier_hash != primary_hash:
existing_document.unique_identifier_hash = primary_hash
logging.info(f"Migrated document to file_id-based identifier: {filename}")
-
+
# Check if content has changed
if existing_document.content_hash == content_hash:
# Content unchanged - check if we need to update metadata (e.g., filename changed)
@@ -131,12 +131,14 @@ async def handle_existing_document_update(
new_name = connector_metadata.get("google_drive_file_name")
# Check both possible keys for old name (FILE_NAME is used in stored documents)
doc_metadata = existing_document.document_metadata or {}
- old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name")
-
+ old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
+ "google_drive_file_name"
+ )
+
if new_name and old_name and old_name != new_name:
# File was renamed - update title and metadata, skip expensive processing
from sqlalchemy.orm.attributes import flag_modified
-
+
existing_document.title = new_name
if not existing_document.document_metadata:
existing_document.document_metadata = {}
@@ -144,8 +146,10 @@ async def handle_existing_document_update(
existing_document.document_metadata["google_drive_file_name"] = new_name
flag_modified(existing_document, "document_metadata")
await session.commit()
- logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
-
+ logging.info(
+ f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
+ )
+
logging.info(f"Document for file {filename} unchanged. Skipping.")
return True, existing_document
else:
@@ -163,25 +167,29 @@ async def find_existing_document_with_migration(
"""
Find existing document, checking both new hash and legacy hash for migration,
with fallback to content_hash for cross-source deduplication.
-
+
Args:
session: Database session
primary_hash: The primary hash (file_id based for Google Drive)
legacy_hash: The legacy hash (filename based) for migration, or None
content_hash: The content hash for fallback deduplication, or None
-
+
Returns:
Existing document if found, None otherwise
"""
# First check with primary hash (new method)
existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
+
# If not found and we have a legacy hash, check with that (migration path)
if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(session, legacy_hash)
+ existing_document = await check_document_by_unique_identifier(
+ session, legacy_hash
+ )
if existing_document:
- logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
-
+ logging.info(
+ "Found legacy document (filename-based hash), will migrate to file_id-based hash"
+ )
+
# Fallback: check by content_hash to catch duplicates from different sources
# This prevents unique constraint violations when the same content exists
# under a different unique_identifier (e.g., manual upload vs Google Drive)
@@ -192,7 +200,7 @@ async def find_existing_document_with_migration(
f"Found duplicate content from different source (content_hash match). "
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
)
-
+
return existing_document
@@ -342,7 +350,12 @@ async def add_received_file_document_using_unstructured(
if existing_document:
# Handle existing document (rename detection, content change check)
should_skip, doc = await handle_existing_document_update(
- session, existing_document, content_hash, connector, file_name, primary_hash
+ session,
+ existing_document,
+ content_hash,
+ connector,
+ file_name,
+ primary_hash,
)
if should_skip:
return doc
@@ -402,7 +415,7 @@ async def add_received_file_document_using_unstructured(
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
+
document = Document(
search_space_id=search_space_id,
title=file_name,
@@ -476,7 +489,12 @@ async def add_received_file_document_using_llamacloud(
if existing_document:
# Handle existing document (rename detection, content change check)
should_skip, doc = await handle_existing_document_update(
- session, existing_document, content_hash, connector, file_name, primary_hash
+ session,
+ existing_document,
+ content_hash,
+ connector,
+ file_name,
+ primary_hash,
)
if should_skip:
return doc
@@ -536,7 +554,7 @@ async def add_received_file_document_using_llamacloud(
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
+
document = Document(
search_space_id=search_space_id,
title=file_name,
@@ -611,7 +629,12 @@ async def add_received_file_document_using_docling(
if existing_document:
# Handle existing document (rename detection, content change check)
should_skip, doc = await handle_existing_document_update(
- session, existing_document, content_hash, connector, file_name, primary_hash
+ session,
+ existing_document,
+ content_hash,
+ connector,
+ file_name,
+ primary_hash,
)
if should_skip:
return doc
@@ -695,7 +718,7 @@ async def add_received_file_document_using_docling(
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
+
document = Document(
search_space_id=search_space_id,
title=file_name,
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index e66aa9170..3a9867fd6 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -31,22 +31,22 @@ def _get_google_drive_unique_identifier(
) -> tuple[str, str | None]:
"""
Get unique identifier hash for a file, with special handling for Google Drive.
-
+
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
For other files, uses filename.
-
+
Args:
connector: Optional connector info dict with type and metadata
filename: The filename (used for non-Google Drive files or as fallback)
search_space_id: The search space ID
-
+
Returns:
Tuple of (primary_hash, legacy_hash or None)
"""
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
metadata = connector.get("metadata", {})
file_id = metadata.get("google_drive_file_id")
-
+
if file_id:
primary_hash = generate_unique_identifier_hash(
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
@@ -55,7 +55,7 @@ def _get_google_drive_unique_identifier(
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
)
return primary_hash, legacy_hash
-
+
primary_hash = generate_unique_identifier_hash(
DocumentType.FILE, filename, search_space_id
)
@@ -73,12 +73,16 @@ async def _find_existing_document_with_migration(
with fallback to content_hash for cross-source deduplication.
"""
existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
+
if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(session, legacy_hash)
+ existing_document = await check_document_by_unique_identifier(
+ session, legacy_hash
+ )
if existing_document:
- logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
-
+ logging.info(
+ "Found legacy document (filename-based hash), will migrate to file_id-based hash"
+ )
+
# Fallback: check by content_hash to catch duplicates from different sources
if not existing_document and content_hash:
existing_document = await check_duplicate_document(session, content_hash)
@@ -87,7 +91,7 @@ async def _find_existing_document_with_migration(
f"Found duplicate content from different source (content_hash match). "
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
)
-
+
return existing_document
@@ -103,7 +107,7 @@ async def _handle_existing_document_update(
) -> tuple[bool, Document | None]:
"""
Handle update logic for an existing document.
-
+
Returns:
Tuple of (should_skip_processing, document_to_return)
"""
@@ -111,7 +115,7 @@ async def _handle_existing_document_update(
if existing_document.unique_identifier_hash != primary_hash:
existing_document.unique_identifier_hash = primary_hash
logging.info(f"Migrated document to file_id-based identifier: {filename}")
-
+
# Check if content has changed
if existing_document.content_hash == content_hash:
# Content unchanged - check if we need to update metadata (e.g., filename changed)
@@ -120,12 +124,16 @@ async def _handle_existing_document_update(
new_name = connector_metadata.get("google_drive_file_name")
# Check both possible keys for old name (FILE_NAME is used in stored documents)
doc_metadata = existing_document.document_metadata or {}
- old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get("google_drive_file_name") or doc_metadata.get("file_name")
-
+ old_name = (
+ doc_metadata.get("FILE_NAME")
+ or doc_metadata.get("google_drive_file_name")
+ or doc_metadata.get("file_name")
+ )
+
if new_name and old_name and old_name != new_name:
# File was renamed - update title and metadata, skip expensive processing
from sqlalchemy.orm.attributes import flag_modified
-
+
existing_document.title = new_name
if not existing_document.document_metadata:
existing_document.document_metadata = {}
@@ -134,8 +142,10 @@ async def _handle_existing_document_update(
existing_document.document_metadata["google_drive_file_name"] = new_name
flag_modified(existing_document, "document_metadata")
await session.commit()
- logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
-
+ logging.info(
+ f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
+ )
+
await task_logger.log_task_success(
log_entry,
f"Markdown file document unchanged: {filename}",
@@ -147,7 +157,9 @@ async def _handle_existing_document_update(
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
return True, existing_document
else:
- logging.info(f"Content changed for markdown file {filename}. Updating document.")
+ logging.info(
+ f"Content changed for markdown file {filename}. Updating document."
+ )
return False, None
@@ -204,8 +216,14 @@ async def add_received_markdown_file_document(
if existing_document:
# Handle existing document (rename detection, content change check)
should_skip, doc = await _handle_existing_document_update(
- session, existing_document, content_hash, connector, file_name, primary_hash,
- task_logger, log_entry
+ session,
+ existing_document,
+ content_hash,
+ connector,
+ file_name,
+ primary_hash,
+ task_logger,
+ log_entry,
)
if should_skip:
return doc
@@ -262,7 +280,7 @@ async def add_received_markdown_file_document(
doc_type = DocumentType.FILE
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
+
document = Document(
search_space_id=search_space_id,
title=file_name,
diff --git a/surfsense_web/components/assistant-ui/connector-popup/components/periodic-sync-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/components/periodic-sync-config.tsx
index aaf52a01f..1d52f0182 100644
--- a/surfsense_web/components/assistant-ui/connector-popup/components/periodic-sync-config.tsx
+++ b/surfsense_web/components/assistant-ui/connector-popup/components/periodic-sync-config.tsx
@@ -38,11 +38,7 @@ export const PeriodicSyncConfig: FC
- Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:{" "}
- {(() => {
+ Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}: {(() => {
const parts: string[] = [];
if (selectedFolders.length > 0) {
- parts.push(`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`);
+ parts.push(
+ `${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`
+ );
}
if (selectedFiles.length > 0) {
parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
@@ -259,9 +286,7 @@ export const GoogleDriveConfig: FC