From c27d24a117633aac32de889b12f153239b58a832 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:41:45 +0530 Subject: [PATCH] feat: enhance folder indexing by adding root folder ID support and implement folder creation and cleanup logic --- .../app/routes/documents_routes.py | 2 + .../local_folder_indexer.py | 102 ++++++++++++++ .../test_local_folder_pipeline.py | 130 ++++++++++++++++++ surfsense_web/hooks/use-folder-sync.ts | 1 + .../lib/apis/documents-api.service.ts | 2 +- 5 files changed, 236 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index d7974f9ff..05221b192 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1280,6 +1280,7 @@ class FolderIndexFileRequest(PydanticBaseModel): folder_name: str search_space_id: int target_file_path: str + root_folder_id: int | None = None enable_summary: bool = False @@ -1394,6 +1395,7 @@ async def folder_index_file( folder_path=request.folder_path, folder_name=request.folder_name, target_file_path=request.target_file_path, + root_folder_id=request.root_folder_id, enable_summary=request.enable_summary, ) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 93c6649a2..3d4ddc19e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -312,6 +312,92 @@ async def _mirror_folder_structure( return mapping, root_folder_id +async def _resolve_folder_for_file( + session: AsyncSession, + rel_path: str, + root_folder_id: int, + search_space_id: int, + user_id: str, +) -> int: + """Given a file's relative path, ensure all parent Folder rows exist and + return the folder_id for the file's immediate parent directory. + + For a file at "notes/daily/today.md", this ensures Folder rows exist for + "notes" and "notes/daily", and returns the id of "notes/daily". + For a file at "readme.md" (root level), returns root_folder_id. + """ + parent_dir = str(Path(rel_path).parent) + if parent_dir == ".": + return root_folder_id + + parts = Path(parent_dir).parts + current_parent_id = root_folder_id + + for part in parts: + existing = ( + await session.execute( + select(Folder).where( + Folder.name == part, + Folder.parent_id == current_parent_id, + Folder.search_space_id == search_space_id, + ) + ) + ).scalar_one_or_none() + + if existing: + current_parent_id = existing.id + else: + new_folder = Folder( + name=part, + parent_id=current_parent_id, + search_space_id=search_space_id, + created_by_id=user_id, + position="a0", + ) + session.add(new_folder) + await session.flush() + current_parent_id = new_folder.id + + return current_parent_id + + +async def _cleanup_empty_folder_chain( + session: AsyncSession, + folder_id: int, + root_folder_id: int, +) -> None: + """Walk up from folder_id toward root, deleting empty folders (no docs, no + children). Stops at root_folder_id which is never deleted.""" + current_id = folder_id + while current_id and current_id != root_folder_id: + has_doc = ( + await session.execute( + select(Document.id).where(Document.folder_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_doc is not None: + break + + has_child = ( + await session.execute( + select(Folder.id).where(Folder.parent_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_child is not None: + break + + folder = ( + await session.execute(select(Folder).where(Folder.id == current_id)) + ).scalar_one_or_none() + if not folder: + break + + parent_id = folder.parent_id + await session.delete(folder) + await session.flush() + current_id = parent_id + + async def _cleanup_empty_folders( session: AsyncSession, root_folder_id: int, @@ -427,6 +513,7 @@ async def index_local_folder( folder_name=folder_name, target_file_path=target_file_path, enable_summary=enable_summary, + root_folder_id=root_folder_id, task_logger=task_logger, log_entry=log_entry, ) @@ -802,6 +889,7 @@ async def _index_single_file( folder_name: str, target_file_path: str, enable_summary: bool, + root_folder_id: int | None, task_logger, log_entry, ) -> tuple[int, int, str | None]: @@ -816,7 +904,13 @@ async def _index_single_file( ) existing = await check_document_by_unique_identifier(session, uid_hash) if existing: + deleted_folder_id = existing.folder_id await session.delete(existing) + await session.flush() + if deleted_folder_id and root_folder_id: + await _cleanup_empty_folder_chain( + session, deleted_folder_id, root_folder_id + ) await session.commit() return 0, 0, None return 0, 0, None @@ -880,6 +974,12 @@ async def _index_single_file( "mtime": mtime, } + folder_id = None + if root_folder_id: + folder_id = await _resolve_folder_for_file( + session, rel_path, root_folder_id, search_space_id, user_id + ) + if existing: existing.title = title existing.content = document_string @@ -887,6 +987,7 @@ async def _index_single_file( existing.source_markdown = content existing.embedding = embedding existing.document_metadata = doc_metadata + existing.folder_id = folder_id await safe_set_chunks(session, existing, chunks) existing.updated_at = get_current_timestamp() existing.status = DocumentStatus.ready() @@ -905,6 +1006,7 @@ async def _index_single_file( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=None, + folder_id=folder_id, ) session.add(document) await session.flush() diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 34efad789..110aa6caf 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -567,6 +567,136 @@ class TestFolderMirroring: ).scalar_one_or_none() assert daily_after is not None + @pytest.mark.usefixtures( + "patched_self_hosted", + "patched_embed_for_indexer", + "patched_chunks_for_indexer", + "patched_summary_for_indexer", + ) + async def test_f6_single_file_creates_subfolder( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F6: Single-file mode creates missing Folder rows and assigns correct folder_id.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "root.md").write_text("root") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + sub = tmp_path / "notes" / "daily" + sub.mkdir(parents=True) + (sub / "new.md").write_text("new note in subfolder") + + count, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_path=str(sub / "new.md"), + root_folder_id=root_folder_id, + ) + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.title == "new.md", + ) + ) + ).scalar_one() + + daily_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "daily") + ) + ).scalar_one() + + assert doc.folder_id == daily_folder.id + assert daily_folder.parent_id is not None + + notes_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "notes") + ) + ).scalar_one() + assert daily_folder.parent_id == notes_folder.id + assert notes_folder.parent_id == root_folder_id + + @pytest.mark.usefixtures( + "patched_self_hosted", + "patched_embed_for_indexer", + "patched_chunks_for_indexer", + "patched_summary_for_indexer", + ) + async def test_f7_single_file_delete_cleans_empty_folders( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F7: Deleting the only file in a subfolder via single-file mode removes empty Folder rows.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + sub = tmp_path / "notes" / "ephemeral" + sub.mkdir(parents=True) + (sub / "temp.md").write_text("temporary") + (tmp_path / "keep.md").write_text("keep this") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + eph_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "ephemeral") + ) + ).scalar_one_or_none() + assert eph_folder is not None + + target = sub / "temp.md" + target.unlink() + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_path=str(target), + root_folder_id=root_folder_id, + ) + + eph_after = ( + await db_session.execute( + select(Folder).where(Folder.name == "ephemeral") + ) + ).scalar_one_or_none() + assert eph_after is None + + notes_after = ( + await db_session.execute( + select(Folder).where(Folder.name == "notes") + ) + ).scalar_one_or_none() + assert notes_after is None + # ==================================================================== # Tier 5: Pipeline Integration (P1) diff --git a/surfsense_web/hooks/use-folder-sync.ts b/surfsense_web/hooks/use-folder-sync.ts index fcfb2814e..f051b7df6 100644 --- a/surfsense_web/hooks/use-folder-sync.ts +++ b/surfsense_web/hooks/use-folder-sync.ts @@ -32,6 +32,7 @@ export function useFolderSync() { folder_name: event.folderName, search_space_id: event.searchSpaceId, target_file_path: event.fullPath, + root_folder_id: event.rootFolderId, }); } catch (err) { console.error("[FolderSync] Failed to trigger re-index:", err); diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index c77cd6848..a8e3831d4 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -399,7 +399,7 @@ class DocumentsApiService { return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body }); }; - folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; enable_summary?: boolean }) => { + folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; root_folder_id?: number | null; enable_summary?: boolean }) => { return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body }); };