feat: refactor folder indexing to support batch processing of multiple files, enhancing performance and error handling

This commit is contained in:
Anish Sarkar 2026-04-03 10:02:36 +05:30
parent e2ba509314
commit 1fa8e1cc83
6 changed files with 293 additions and 80 deletions

View file

@ -1275,7 +1275,7 @@ def index_local_folder_task(
file_extensions: list[str] | None = None,
root_folder_id: int | None = None,
enable_summary: bool = False,
target_file_path: str | None = None,
target_file_paths: list[str] | None = None,
):
"""Celery task to index a local folder. Config is passed directly — no connector row."""
loop = asyncio.new_event_loop()
@ -1292,7 +1292,7 @@ def index_local_folder_task(
file_extensions=file_extensions,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
target_file_path=target_file_path,
target_file_paths=target_file_paths,
)
)
finally:
@ -1308,19 +1308,103 @@ async def _index_local_folder_async(
file_extensions: list[str] | None = None,
root_folder_id: int | None = None,
enable_summary: bool = False,
target_file_path: str | None = None,
target_file_paths: list[str] | None = None,
):
"""Run local folder indexing with a fresh DB session."""
"""Run local folder indexing with notification + heartbeat."""
is_batch = bool(target_file_paths)
is_full_scan = not target_file_paths
file_count = len(target_file_paths) if target_file_paths else None
if is_batch:
doc_name = f"{folder_name} ({file_count} file{'s' if file_count != 1 else ''})"
else:
doc_name = folder_name
notification = None
heartbeat_task = None
async with get_celery_session_maker()() as session:
await index_local_folder(
session=session,
search_space_id=search_space_id,
user_id=user_id,
folder_path=folder_path,
folder_name=folder_name,
exclude_patterns=exclude_patterns,
file_extensions=file_extensions,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
target_file_path=target_file_path,
)
try:
notification = (
await NotificationService.document_processing.notify_processing_started(
session=session,
user_id=UUID(user_id),
document_type="LOCAL_FOLDER_FILE",
document_name=doc_name,
search_space_id=search_space_id,
)
)
_start_heartbeat(notification.id)
heartbeat_task = asyncio.create_task(
_run_heartbeat_loop(notification.id)
)
except Exception:
logger.warning(
"Failed to create notification for local folder indexing",
exc_info=True,
)
async def _heartbeat_progress(completed_count: int) -> None:
    """Push the current sync progress to the processing notification.

    Does nothing when no notification was created. The update is
    best-effort: a failure is logged at debug level and never propagates
    to the caller.

    Args:
        completed_count: Number of files handled so far. Rendered in the
            stage message as ``completed/total``; the total is shown as
            ``?`` when ``file_count`` is falsy.
    """
    if not notification:
        return
    try:
        await NotificationService.document_processing.notify_processing_progress(
            session=session,
            notification=notification,
            stage="indexing",
            stage_message=f"Syncing files ({completed_count}/{file_count or '?'})",
        )
    except Exception:
        # Progress reporting must never abort indexing, but swallowing the
        # error silently hides broken notifications; leave a trace instead.
        logger.debug(
            "Failed to update local folder indexing progress notification",
            exc_info=True,
        )
try:
indexed, skipped_or_failed, _rfid, err = await index_local_folder(
session=session,
search_space_id=search_space_id,
user_id=user_id,
folder_path=folder_path,
folder_name=folder_name,
exclude_patterns=exclude_patterns,
file_extensions=file_extensions,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
target_file_paths=target_file_paths,
on_heartbeat_callback=_heartbeat_progress if (is_batch or is_full_scan) else None,
)
if notification:
try:
if err:
await NotificationService.document_processing.notify_processing_completed(
session=session,
notification=notification,
error_message=err,
)
else:
await NotificationService.document_processing.notify_processing_completed(
session=session,
notification=notification,
)
except Exception:
logger.warning(
"Failed to update notification after local folder indexing",
exc_info=True,
)
except Exception as e:
logger.exception(f"Local folder indexing failed: {e}")
if notification:
try:
await NotificationService.document_processing.notify_processing_completed(
session=session,
notification=notification,
error_message=str(e)[:200],
)
except Exception:
pass
raise
finally:
if heartbeat_task:
heartbeat_task.cancel()
if notification:
_stop_heartbeat(notification.id)