feat: enhance folder indexing by adding root folder ID support and implement folder creation and cleanup logic

This commit is contained in:
Anish Sarkar 2026-04-02 22:41:45 +05:30
parent caf2525ab5
commit c27d24a117
5 changed files with 236 additions and 1 deletion

View file

@ -1280,6 +1280,7 @@ class FolderIndexFileRequest(PydanticBaseModel):
folder_name: str folder_name: str
search_space_id: int search_space_id: int
target_file_path: str target_file_path: str
root_folder_id: int | None = None
enable_summary: bool = False enable_summary: bool = False
@ -1394,6 +1395,7 @@ async def folder_index_file(
folder_path=request.folder_path, folder_path=request.folder_path,
folder_name=request.folder_name, folder_name=request.folder_name,
target_file_path=request.target_file_path, target_file_path=request.target_file_path,
root_folder_id=request.root_folder_id,
enable_summary=request.enable_summary, enable_summary=request.enable_summary,
) )

View file

@ -312,6 +312,92 @@ async def _mirror_folder_structure(
return mapping, root_folder_id return mapping, root_folder_id
async def _resolve_folder_for_file(
    session: AsyncSession,
    rel_path: str,
    root_folder_id: int,
    search_space_id: int,
    user_id: str,
) -> int:
    """Ensure Folder rows exist for every directory on *rel_path* and return
    the id of the file's immediate parent folder.

    E.g. for "notes/daily/today.md" this creates (if missing) Folder rows for
    "notes" and "notes/daily" and returns the id of "notes/daily". A
    root-level file such as "readme.md" resolves directly to *root_folder_id*.
    """
    parent_dir = str(Path(rel_path).parent)
    if parent_dir == ".":
        # File sits directly under the indexed root; nothing to create.
        return root_folder_id

    folder_id = root_folder_id
    for segment in Path(parent_dir).parts:
        result = await session.execute(
            select(Folder).where(
                Folder.name == segment,
                Folder.parent_id == folder_id,
                Folder.search_space_id == search_space_id,
            )
        )
        folder = result.scalar_one_or_none()
        if folder is None:
            # Missing intermediate directory: materialize a Folder row for it.
            folder = Folder(
                name=segment,
                parent_id=folder_id,
                search_space_id=search_space_id,
                created_by_id=user_id,
                position="a0",
            )
            session.add(folder)
            await session.flush()  # populate folder.id before descending
        folder_id = folder.id
    return folder_id
async def _cleanup_empty_folder_chain(
    session: AsyncSession,
    folder_id: int,
    root_folder_id: int,
) -> None:
    """Delete empty folders walking upward from *folder_id* toward the root.

    A folder counts as empty when it holds no documents and no child folders.
    The walk stops at the first non-empty folder; *root_folder_id* itself is
    never deleted.
    """
    node_id = folder_id
    while node_id and node_id != root_folder_id:
        # Any document still attached? Then this folder (and every ancestor)
        # must stay.
        doc_result = await session.execute(
            select(Document.id).where(Document.folder_id == node_id).limit(1)
        )
        if doc_result.scalar_one_or_none() is not None:
            return

        # Any child folder left? Same rule.
        child_result = await session.execute(
            select(Folder.id).where(Folder.parent_id == node_id).limit(1)
        )
        if child_result.scalar_one_or_none() is not None:
            return

        folder_result = await session.execute(
            select(Folder).where(Folder.id == node_id)
        )
        folder = folder_result.scalar_one_or_none()
        if folder is None:
            # Row already gone (e.g. concurrent cleanup); nothing more to do.
            return

        next_id = folder.parent_id
        await session.delete(folder)
        await session.flush()  # make the deletion visible to the next lookup
        node_id = next_id
async def _cleanup_empty_folders( async def _cleanup_empty_folders(
session: AsyncSession, session: AsyncSession,
root_folder_id: int, root_folder_id: int,
@ -427,6 +513,7 @@ async def index_local_folder(
folder_name=folder_name, folder_name=folder_name,
target_file_path=target_file_path, target_file_path=target_file_path,
enable_summary=enable_summary, enable_summary=enable_summary,
root_folder_id=root_folder_id,
task_logger=task_logger, task_logger=task_logger,
log_entry=log_entry, log_entry=log_entry,
) )
@ -802,6 +889,7 @@ async def _index_single_file(
folder_name: str, folder_name: str,
target_file_path: str, target_file_path: str,
enable_summary: bool, enable_summary: bool,
root_folder_id: int | None,
task_logger, task_logger,
log_entry, log_entry,
) -> tuple[int, int, str | None]: ) -> tuple[int, int, str | None]:
@ -816,7 +904,13 @@ async def _index_single_file(
) )
existing = await check_document_by_unique_identifier(session, uid_hash) existing = await check_document_by_unique_identifier(session, uid_hash)
if existing: if existing:
deleted_folder_id = existing.folder_id
await session.delete(existing) await session.delete(existing)
await session.flush()
if deleted_folder_id and root_folder_id:
await _cleanup_empty_folder_chain(
session, deleted_folder_id, root_folder_id
)
await session.commit() await session.commit()
return 0, 0, None return 0, 0, None
return 0, 0, None return 0, 0, None
@ -880,6 +974,12 @@ async def _index_single_file(
"mtime": mtime, "mtime": mtime,
} }
folder_id = None
if root_folder_id:
folder_id = await _resolve_folder_for_file(
session, rel_path, root_folder_id, search_space_id, user_id
)
if existing: if existing:
existing.title = title existing.title = title
existing.content = document_string existing.content = document_string
@ -887,6 +987,7 @@ async def _index_single_file(
existing.source_markdown = content existing.source_markdown = content
existing.embedding = embedding existing.embedding = embedding
existing.document_metadata = doc_metadata existing.document_metadata = doc_metadata
existing.folder_id = folder_id
await safe_set_chunks(session, existing, chunks) await safe_set_chunks(session, existing, chunks)
existing.updated_at = get_current_timestamp() existing.updated_at = get_current_timestamp()
existing.status = DocumentStatus.ready() existing.status = DocumentStatus.ready()
@ -905,6 +1006,7 @@ async def _index_single_file(
updated_at=get_current_timestamp(), updated_at=get_current_timestamp(),
created_by_id=user_id, created_by_id=user_id,
connector_id=None, connector_id=None,
folder_id=folder_id,
) )
session.add(document) session.add(document)
await session.flush() await session.flush()

View file

@ -567,6 +567,136 @@ class TestFolderMirroring:
).scalar_one_or_none() ).scalar_one_or_none()
assert daily_after is not None assert daily_after is not None
@pytest.mark.usefixtures(
    "patched_self_hosted",
    "patched_embed_for_indexer",
    "patched_chunks_for_indexer",
    "patched_summary_for_indexer",
)
async def test_f6_single_file_creates_subfolder(
    self,
    db_session: AsyncSession,
    db_user: User,
    db_search_space: SearchSpace,
    tmp_path: Path,
):
    """F6: Single-file mode creates missing Folder rows and assigns correct folder_id."""
    from app.tasks.connector_indexers.local_folder_indexer import index_local_folder

    # Full index of a one-file tree to establish the root folder.
    (tmp_path / "root.md").write_text("root")
    _, _, root_folder_id, _ = await index_local_folder(
        session=db_session,
        search_space_id=db_search_space.id,
        user_id=str(db_user.id),
        folder_path=str(tmp_path),
        folder_name="test-folder",
    )

    # Drop a new file into a subtree that has no Folder rows yet, then
    # re-index just that file.
    subdir = tmp_path / "notes" / "daily"
    subdir.mkdir(parents=True)
    (subdir / "new.md").write_text("new note in subfolder")
    indexed_count, _, _, _ = await index_local_folder(
        session=db_session,
        search_space_id=db_search_space.id,
        user_id=str(db_user.id),
        folder_path=str(tmp_path),
        folder_name="test-folder",
        target_file_path=str(subdir / "new.md"),
        root_folder_id=root_folder_id,
    )
    assert indexed_count == 1

    document = (
        await db_session.execute(
            select(Document).where(
                Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
                Document.title == "new.md",
            )
        )
    ).scalar_one()
    daily = (
        await db_session.execute(select(Folder).where(Folder.name == "daily"))
    ).scalar_one()
    notes = (
        await db_session.execute(select(Folder).where(Folder.name == "notes"))
    ).scalar_one()

    # Document lands in notes/daily, which chains up to the root folder.
    assert document.folder_id == daily.id
    assert daily.parent_id is not None
    assert daily.parent_id == notes.id
    assert notes.parent_id == root_folder_id
@pytest.mark.usefixtures(
    "patched_self_hosted",
    "patched_embed_for_indexer",
    "patched_chunks_for_indexer",
    "patched_summary_for_indexer",
)
async def test_f7_single_file_delete_cleans_empty_folders(
    self,
    db_session: AsyncSession,
    db_user: User,
    db_search_space: SearchSpace,
    tmp_path: Path,
):
    """F7: Deleting the only file in a subfolder via single-file mode removes empty Folder rows."""
    from app.tasks.connector_indexers.local_folder_indexer import index_local_folder

    # Build a tree where notes/ephemeral holds exactly one file, plus one
    # root-level file so the root itself is never emptied.
    subdir = tmp_path / "notes" / "ephemeral"
    subdir.mkdir(parents=True)
    (subdir / "temp.md").write_text("temporary")
    (tmp_path / "keep.md").write_text("keep this")
    _, _, root_folder_id, _ = await index_local_folder(
        session=db_session,
        search_space_id=db_search_space.id,
        user_id=str(db_user.id),
        folder_path=str(tmp_path),
        folder_name="test-folder",
    )

    # Sanity: the full index created the ephemeral folder row.
    eph_before = (
        await db_session.execute(select(Folder).where(Folder.name == "ephemeral"))
    ).scalar_one_or_none()
    assert eph_before is not None

    # Remove the file on disk and re-index just that path.
    target = subdir / "temp.md"
    target.unlink()
    await index_local_folder(
        session=db_session,
        search_space_id=db_search_space.id,
        user_id=str(db_user.id),
        folder_path=str(tmp_path),
        folder_name="test-folder",
        target_file_path=str(target),
        root_folder_id=root_folder_id,
    )

    # Both now-empty ancestors are gone.
    eph_after = (
        await db_session.execute(select(Folder).where(Folder.name == "ephemeral"))
    ).scalar_one_or_none()
    assert eph_after is None
    notes_after = (
        await db_session.execute(select(Folder).where(Folder.name == "notes"))
    ).scalar_one_or_none()
    assert notes_after is None
# ==================================================================== # ====================================================================
# Tier 5: Pipeline Integration (P1) # Tier 5: Pipeline Integration (P1)

View file

@ -32,6 +32,7 @@ export function useFolderSync() {
folder_name: event.folderName, folder_name: event.folderName,
search_space_id: event.searchSpaceId, search_space_id: event.searchSpaceId,
target_file_path: event.fullPath, target_file_path: event.fullPath,
root_folder_id: event.rootFolderId,
}); });
} catch (err) { } catch (err) {
console.error("[FolderSync] Failed to trigger re-index:", err); console.error("[FolderSync] Failed to trigger re-index:", err);

View file

@ -399,7 +399,7 @@ class DocumentsApiService {
return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body }); return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body });
}; };
folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; enable_summary?: boolean }) => { folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; root_folder_id?: number | null; enable_summary?: boolean }) => {
return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body }); return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body });
}; };