mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-11 08:42:39 +02:00
feat: enhance folder indexing by adding root folder ID support and implement folder creation and cleanup logic
This commit is contained in:
parent
caf2525ab5
commit
c27d24a117
5 changed files with 236 additions and 1 deletions
|
|
@ -1280,6 +1280,7 @@ class FolderIndexFileRequest(PydanticBaseModel):
|
||||||
folder_name: str
|
folder_name: str
|
||||||
search_space_id: int
|
search_space_id: int
|
||||||
target_file_path: str
|
target_file_path: str
|
||||||
|
root_folder_id: int | None = None
|
||||||
enable_summary: bool = False
|
enable_summary: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1394,6 +1395,7 @@ async def folder_index_file(
|
||||||
folder_path=request.folder_path,
|
folder_path=request.folder_path,
|
||||||
folder_name=request.folder_name,
|
folder_name=request.folder_name,
|
||||||
target_file_path=request.target_file_path,
|
target_file_path=request.target_file_path,
|
||||||
|
root_folder_id=request.root_folder_id,
|
||||||
enable_summary=request.enable_summary,
|
enable_summary=request.enable_summary,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -312,6 +312,92 @@ async def _mirror_folder_structure(
|
||||||
return mapping, root_folder_id
|
return mapping, root_folder_id
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_folder_for_file(
    session: AsyncSession,
    rel_path: str,
    root_folder_id: int,
    search_space_id: int,
    user_id: str,
) -> int:
    """Ensure Folder rows exist for every parent directory of *rel_path*.

    Walks the relative path's directory components from the root downward,
    reusing an existing ``Folder`` row (matched by name, parent and search
    space) or creating a missing one, and returns the id of the file's
    immediate parent folder.

    For a file at ``"notes/daily/today.md"`` this guarantees Folder rows for
    ``"notes"`` and ``"notes/daily"`` and returns the id of ``"notes/daily"``.
    For a root-level file such as ``"readme.md"`` it returns *root_folder_id*.

    Args:
        session: Active async DB session; new folders are flushed, not committed.
        rel_path: File path relative to the indexed root folder.
        root_folder_id: Id of the root Folder row; parent of the first component.
        search_space_id: Search space the folders belong to.
        user_id: Recorded as ``created_by_id`` on any newly created folder.

    Returns:
        The folder id to assign to the file's Document row.
    """
    parent_dir = str(Path(rel_path).parent)
    # Path("readme.md").parent is ".", i.e. the file sits directly in the root.
    if parent_dir == ".":
        return root_folder_id

    parts = Path(parent_dir).parts
    current_parent_id = root_folder_id

    for part in parts:
        existing = (
            await session.execute(
                select(Folder).where(
                    Folder.name == part,
                    Folder.parent_id == current_parent_id,
                    Folder.search_space_id == search_space_id,
                )
            )
        ).scalar_one_or_none()

        if existing is not None:
            current_parent_id = existing.id
        else:
            new_folder = Folder(
                name=part,
                parent_id=current_parent_id,
                search_space_id=search_space_id,
                created_by_id=user_id,
                # NOTE(review): "a0" looks like a fractional-indexing sort key
                # placing new folders at a default position — confirm against
                # the Folder.position convention used elsewhere.
                position="a0",
            )
            session.add(new_folder)
            # Flush so the new folder gets a primary key we can use as the
            # parent id for the next path component.
            await session.flush()
            current_parent_id = new_folder.id

    return current_parent_id
|
||||||
|
|
||||||
|
|
||||||
|
async def _cleanup_empty_folder_chain(
    session: AsyncSession,
    folder_id: int,
    root_folder_id: int,
) -> None:
    """Walk up from *folder_id* toward the root, deleting empty folders.

    A folder is deleted only when it has no documents and no child folders;
    the walk then continues with its parent. The walk stops at the first
    non-empty folder, at a missing folder row, or at *root_folder_id*,
    which is never deleted.

    Args:
        session: Active async DB session; deletions are flushed, not committed.
        folder_id: Folder to start from (typically the deleted file's parent).
        root_folder_id: Upper bound of the walk; left untouched.
    """
    current_id = folder_id
    # `current_id` may become None when we reach a folder with no parent.
    while current_id and current_id != root_folder_id:
        # Any document still referencing this folder keeps the whole chain.
        has_doc = (
            await session.execute(
                select(Document.id).where(Document.folder_id == current_id).limit(1)
            )
        ).scalar_one_or_none()
        if has_doc is not None:
            break

        # Likewise, any child folder means this one is not empty.
        has_child = (
            await session.execute(
                select(Folder.id).where(Folder.parent_id == current_id).limit(1)
            )
        ).scalar_one_or_none()
        if has_child is not None:
            break

        folder = (
            await session.execute(select(Folder).where(Folder.id == current_id))
        ).scalar_one_or_none()
        if not folder:
            # Row vanished (e.g. concurrent delete); nothing more to do.
            break

        # Capture the parent before deleting so the walk can continue upward.
        parent_id = folder.parent_id
        await session.delete(folder)
        await session.flush()
        current_id = parent_id
|
||||||
|
|
||||||
|
|
||||||
async def _cleanup_empty_folders(
|
async def _cleanup_empty_folders(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
|
|
@ -427,6 +513,7 @@ async def index_local_folder(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
target_file_path=target_file_path,
|
target_file_path=target_file_path,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
|
root_folder_id=root_folder_id,
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
)
|
)
|
||||||
|
|
@ -802,6 +889,7 @@ async def _index_single_file(
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
target_file_path: str,
|
target_file_path: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
|
root_folder_id: int | None,
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None]:
|
||||||
|
|
@ -816,7 +904,13 @@ async def _index_single_file(
|
||||||
)
|
)
|
||||||
existing = await check_document_by_unique_identifier(session, uid_hash)
|
existing = await check_document_by_unique_identifier(session, uid_hash)
|
||||||
if existing:
|
if existing:
|
||||||
|
deleted_folder_id = existing.folder_id
|
||||||
await session.delete(existing)
|
await session.delete(existing)
|
||||||
|
await session.flush()
|
||||||
|
if deleted_folder_id and root_folder_id:
|
||||||
|
await _cleanup_empty_folder_chain(
|
||||||
|
session, deleted_folder_id, root_folder_id
|
||||||
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
return 0, 0, None
|
return 0, 0, None
|
||||||
return 0, 0, None
|
return 0, 0, None
|
||||||
|
|
@ -880,6 +974,12 @@ async def _index_single_file(
|
||||||
"mtime": mtime,
|
"mtime": mtime,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
folder_id = None
|
||||||
|
if root_folder_id:
|
||||||
|
folder_id = await _resolve_folder_for_file(
|
||||||
|
session, rel_path, root_folder_id, search_space_id, user_id
|
||||||
|
)
|
||||||
|
|
||||||
if existing:
|
if existing:
|
||||||
existing.title = title
|
existing.title = title
|
||||||
existing.content = document_string
|
existing.content = document_string
|
||||||
|
|
@ -887,6 +987,7 @@ async def _index_single_file(
|
||||||
existing.source_markdown = content
|
existing.source_markdown = content
|
||||||
existing.embedding = embedding
|
existing.embedding = embedding
|
||||||
existing.document_metadata = doc_metadata
|
existing.document_metadata = doc_metadata
|
||||||
|
existing.folder_id = folder_id
|
||||||
await safe_set_chunks(session, existing, chunks)
|
await safe_set_chunks(session, existing, chunks)
|
||||||
existing.updated_at = get_current_timestamp()
|
existing.updated_at = get_current_timestamp()
|
||||||
existing.status = DocumentStatus.ready()
|
existing.status = DocumentStatus.ready()
|
||||||
|
|
@ -905,6 +1006,7 @@ async def _index_single_file(
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
connector_id=None,
|
connector_id=None,
|
||||||
|
folder_id=folder_id,
|
||||||
)
|
)
|
||||||
session.add(document)
|
session.add(document)
|
||||||
await session.flush()
|
await session.flush()
|
||||||
|
|
|
||||||
|
|
@ -567,6 +567,136 @@ class TestFolderMirroring:
|
||||||
).scalar_one_or_none()
|
).scalar_one_or_none()
|
||||||
assert daily_after is not None
|
assert daily_after is not None
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures(
|
||||||
|
"patched_self_hosted",
|
||||||
|
"patched_embed_for_indexer",
|
||||||
|
"patched_chunks_for_indexer",
|
||||||
|
"patched_summary_for_indexer",
|
||||||
|
)
|
||||||
|
async def test_f6_single_file_creates_subfolder(
|
||||||
|
self,
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
tmp_path: Path,
|
||||||
|
):
|
||||||
|
"""F6: Single-file mode creates missing Folder rows and assigns correct folder_id."""
|
||||||
|
from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
|
||||||
|
|
||||||
|
(tmp_path / "root.md").write_text("root")
|
||||||
|
|
||||||
|
_, _, root_folder_id, _ = await index_local_folder(
|
||||||
|
session=db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
user_id=str(db_user.id),
|
||||||
|
folder_path=str(tmp_path),
|
||||||
|
folder_name="test-folder",
|
||||||
|
)
|
||||||
|
|
||||||
|
sub = tmp_path / "notes" / "daily"
|
||||||
|
sub.mkdir(parents=True)
|
||||||
|
(sub / "new.md").write_text("new note in subfolder")
|
||||||
|
|
||||||
|
count, _, _, _ = await index_local_folder(
|
||||||
|
session=db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
user_id=str(db_user.id),
|
||||||
|
folder_path=str(tmp_path),
|
||||||
|
folder_name="test-folder",
|
||||||
|
target_file_path=str(sub / "new.md"),
|
||||||
|
root_folder_id=root_folder_id,
|
||||||
|
)
|
||||||
|
assert count == 1
|
||||||
|
|
||||||
|
doc = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Document).where(
|
||||||
|
Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
|
||||||
|
Document.title == "new.md",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
).scalar_one()
|
||||||
|
|
||||||
|
daily_folder = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Folder).where(Folder.name == "daily")
|
||||||
|
)
|
||||||
|
).scalar_one()
|
||||||
|
|
||||||
|
assert doc.folder_id == daily_folder.id
|
||||||
|
assert daily_folder.parent_id is not None
|
||||||
|
|
||||||
|
notes_folder = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Folder).where(Folder.name == "notes")
|
||||||
|
)
|
||||||
|
).scalar_one()
|
||||||
|
assert daily_folder.parent_id == notes_folder.id
|
||||||
|
assert notes_folder.parent_id == root_folder_id
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures(
|
||||||
|
"patched_self_hosted",
|
||||||
|
"patched_embed_for_indexer",
|
||||||
|
"patched_chunks_for_indexer",
|
||||||
|
"patched_summary_for_indexer",
|
||||||
|
)
|
||||||
|
async def test_f7_single_file_delete_cleans_empty_folders(
|
||||||
|
self,
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
tmp_path: Path,
|
||||||
|
):
|
||||||
|
"""F7: Deleting the only file in a subfolder via single-file mode removes empty Folder rows."""
|
||||||
|
from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
|
||||||
|
|
||||||
|
sub = tmp_path / "notes" / "ephemeral"
|
||||||
|
sub.mkdir(parents=True)
|
||||||
|
(sub / "temp.md").write_text("temporary")
|
||||||
|
(tmp_path / "keep.md").write_text("keep this")
|
||||||
|
|
||||||
|
_, _, root_folder_id, _ = await index_local_folder(
|
||||||
|
session=db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
user_id=str(db_user.id),
|
||||||
|
folder_path=str(tmp_path),
|
||||||
|
folder_name="test-folder",
|
||||||
|
)
|
||||||
|
|
||||||
|
eph_folder = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Folder).where(Folder.name == "ephemeral")
|
||||||
|
)
|
||||||
|
).scalar_one_or_none()
|
||||||
|
assert eph_folder is not None
|
||||||
|
|
||||||
|
target = sub / "temp.md"
|
||||||
|
target.unlink()
|
||||||
|
|
||||||
|
await index_local_folder(
|
||||||
|
session=db_session,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
user_id=str(db_user.id),
|
||||||
|
folder_path=str(tmp_path),
|
||||||
|
folder_name="test-folder",
|
||||||
|
target_file_path=str(target),
|
||||||
|
root_folder_id=root_folder_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
eph_after = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Folder).where(Folder.name == "ephemeral")
|
||||||
|
)
|
||||||
|
).scalar_one_or_none()
|
||||||
|
assert eph_after is None
|
||||||
|
|
||||||
|
notes_after = (
|
||||||
|
await db_session.execute(
|
||||||
|
select(Folder).where(Folder.name == "notes")
|
||||||
|
)
|
||||||
|
).scalar_one_or_none()
|
||||||
|
assert notes_after is None
|
||||||
|
|
||||||
|
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
# Tier 5: Pipeline Integration (P1)
|
# Tier 5: Pipeline Integration (P1)
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,7 @@ export function useFolderSync() {
|
||||||
folder_name: event.folderName,
|
folder_name: event.folderName,
|
||||||
search_space_id: event.searchSpaceId,
|
search_space_id: event.searchSpaceId,
|
||||||
target_file_path: event.fullPath,
|
target_file_path: event.fullPath,
|
||||||
|
root_folder_id: event.rootFolderId,
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error("[FolderSync] Failed to trigger re-index:", err);
|
console.error("[FolderSync] Failed to trigger re-index:", err);
|
||||||
|
|
|
||||||
|
|
@ -399,7 +399,7 @@ class DocumentsApiService {
|
||||||
return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body });
|
return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body });
|
||||||
};
|
};
|
||||||
|
|
||||||
folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; enable_summary?: boolean }) => {
|
folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; root_folder_id?: number | null; enable_summary?: boolean }) => {
|
||||||
return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body });
|
return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body });
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue