feat: scope folder sync finalization and empty-folder cleanup to the root folder's subtree (via get_folder_subtree_ids) instead of every folder in the search space

This commit is contained in:
Anish Sarkar 2026-04-08 17:10:22 +05:30
parent ae98f64760
commit cab0d1bdfe
2 changed files with 29 additions and 47 deletions

View file

@ -1675,6 +1675,7 @@ async def folder_sync_finalize(
gets deleted. gets deleted.
""" """
from app.indexing_pipeline.document_hashing import compute_identifier_hash from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.services.folder_service import get_folder_subtree_ids
from app.tasks.connector_indexers.local_folder_indexer import ( from app.tasks.connector_indexers.local_folder_indexer import (
_cleanup_empty_folders, _cleanup_empty_folders,
) )
@ -1687,6 +1688,11 @@ async def folder_sync_finalize(
"You don't have permission to delete documents in this search space", "You don't have permission to delete documents in this search space",
) )
if not request.root_folder_id:
return {"deleted_count": 0}
subtree_ids = await get_folder_subtree_ids(session, request.root_folder_id)
seen_hashes: set[str] = set() seen_hashes: set[str] = set()
for rel_path in request.all_relative_paths: for rel_path in request.all_relative_paths:
unique_id = f"{request.folder_name}:{rel_path}" unique_id = f"{request.folder_name}:{rel_path}"
@ -1697,32 +1703,13 @@ async def folder_sync_finalize(
) )
seen_hashes.add(uid_hash) seen_hashes.add(uid_hash)
all_root_folder_ids: set[int] = set()
if request.root_folder_id:
all_root_folder_ids.add(request.root_folder_id)
all_db_folders = (
(
await session.execute(
select(Folder.id).where(
Folder.search_space_id == request.search_space_id,
)
)
)
.scalars()
.all()
)
all_root_folder_ids.update(all_db_folders)
all_folder_docs = ( all_folder_docs = (
( (
await session.execute( await session.execute(
select(Document).where( select(Document).where(
Document.document_type == DocumentType.LOCAL_FOLDER_FILE, Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
Document.search_space_id == request.search_space_id, Document.search_space_id == request.search_space_id,
Document.folder_id.in_(list(all_root_folder_ids)) Document.folder_id.in_(subtree_ids),
if all_root_folder_ids
else True,
) )
) )
) )
@ -1738,24 +1725,22 @@ async def folder_sync_finalize(
await session.flush() await session.flush()
if request.root_folder_id: existing_dirs: set[str] = set()
existing_dirs: set[str] = set() for rel_path in request.all_relative_paths:
for rel_path in request.all_relative_paths: parent = str(os.path.dirname(rel_path))
parent = str(os.path.dirname(rel_path)) if parent and parent != ".":
if parent and parent != ".": existing_dirs.add(parent)
existing_dirs.add(parent)
folder_mapping: dict[str, int] = {} folder_mapping: dict[str, int] = {"": request.root_folder_id}
if request.root_folder_id:
folder_mapping[""] = request.root_folder_id
await _cleanup_empty_folders( await _cleanup_empty_folders(
session, session,
request.root_folder_id, request.root_folder_id,
request.search_space_id, request.search_space_id,
existing_dirs, existing_dirs,
folder_mapping, folder_mapping,
) subtree_ids=subtree_ids,
)
await session.commit() await session.commit()
return {"deleted_count": deleted_count} return {"deleted_count": deleted_count}

View file

@ -387,24 +387,21 @@ async def _cleanup_empty_folders(
search_space_id: int, search_space_id: int,
existing_dirs_on_disk: set[str], existing_dirs_on_disk: set[str],
folder_mapping: dict[str, int], folder_mapping: dict[str, int],
subtree_ids: list[int] | None = None,
) -> None: ) -> None:
"""Delete Folder rows that are empty (no docs, no children) and no longer on disk.""" """Delete Folder rows that are empty (no docs, no children) and no longer on disk."""
from sqlalchemy import delete as sa_delete from sqlalchemy import delete as sa_delete
id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel} id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel}
all_folders = ( query = select(Folder).where(
( Folder.search_space_id == search_space_id,
await session.execute( Folder.id != root_folder_id,
select(Folder).where(
Folder.search_space_id == search_space_id,
Folder.id != root_folder_id,
)
)
)
.scalars()
.all()
) )
if subtree_ids is not None:
query = query.where(Folder.id.in_(subtree_ids))
all_folders = (await session.execute(query)).scalars().all()
candidates: list[Folder] = [] candidates: list[Folder] = []
for folder in all_folders: for folder in all_folders: