From 60eb1e406062cff88a0abd215d53cdea085a4e30 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 16:28:51 +0530
Subject: [PATCH 11/18] feat: implement raw file hash computation to optimize
content extraction during local folder indexing
---
.../local_folder_indexer.py | 94 +++++++++++++++++--
1 file changed, 88 insertions(+), 6 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 7b433cf62..5c4878a04 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -178,6 +178,22 @@ def _content_hash(content: str, search_space_id: int) -> str:
return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()
+def _compute_raw_file_hash(file_path: str) -> str:
+ """SHA-256 hash of the raw file bytes.
+
+ Much cheaper than ETL/OCR extraction -- only performs sequential I/O.
+ Used as a pre-filter to skip expensive content extraction when the
+ underlying file hasn't changed at all.
+ """
+ import hashlib
+
+ h = hashlib.sha256()
+ with open(file_path, "rb") as f:
+ for chunk in iter(lambda: f.read(8192), b""):
+ h.update(chunk)
+ return h.hexdigest()
+
+
async def _compute_file_content_hash(
file_path: str,
filename: str,
@@ -630,6 +646,24 @@ async def index_local_folder(
skipped_count += 1
continue
+ raw_hash = await asyncio.to_thread(
+ _compute_raw_file_hash, file_path_abs
+ )
+
+ stored_raw_hash = (
+ existing_document.document_metadata or {}
+ ).get("raw_file_hash")
+ if stored_raw_hash and stored_raw_hash == raw_hash:
+ meta = dict(existing_document.document_metadata or {})
+ meta["mtime"] = current_mtime
+ existing_document.document_metadata = meta
+ if not DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.READY
+ ):
+ existing_document.status = DocumentStatus.ready()
+ skipped_count += 1
+ continue
+
try:
estimated_pages = await _check_page_limit_or_skip(
page_limit_service, user_id, file_path_abs
@@ -653,6 +687,7 @@ async def index_local_folder(
if existing_document.content_hash == content_hash:
meta = dict(existing_document.document_metadata or {})
meta["mtime"] = current_mtime
+ meta["raw_file_hash"] = raw_hash
existing_document.document_metadata = meta
if not DocumentStatus.is_state(
existing_document.status, DocumentStatus.READY
@@ -687,6 +722,10 @@ async def index_local_folder(
skipped_count += 1
continue
+ raw_hash = await asyncio.to_thread(
+ _compute_raw_file_hash, file_path_abs
+ )
+
doc = _build_connector_doc(
title=file_info["name"],
content=content,
@@ -702,6 +741,7 @@ async def index_local_folder(
"mtime": file_info["modified_at"].timestamp(),
"estimated_pages": estimated_pages,
"content_length": len(content),
+ "raw_file_hash": raw_hash,
}
except Exception as e:
@@ -795,6 +835,7 @@ async def index_local_folder(
doc_meta = dict(result.document_metadata or {})
doc_meta["mtime"] = mtime_info.get("mtime")
+ doc_meta["raw_file_hash"] = mtime_info.get("raw_file_hash")
result.document_metadata = doc_meta
est = mtime_info.get("estimated_pages", 1)
@@ -988,6 +1029,26 @@ async def _index_single_file(
DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id
)
+ raw_hash = await asyncio.to_thread(_compute_raw_file_hash, str(full_path))
+
+ existing = await check_document_by_unique_identifier(session, uid_hash)
+
+ if existing:
+ stored_raw_hash = (existing.document_metadata or {}).get(
+ "raw_file_hash"
+ )
+ if stored_raw_hash and stored_raw_hash == raw_hash:
+ mtime = full_path.stat().st_mtime
+ meta = dict(existing.document_metadata or {})
+ meta["mtime"] = mtime
+ existing.document_metadata = meta
+ if not DocumentStatus.is_state(
+ existing.status, DocumentStatus.READY
+ ):
+ existing.status = DocumentStatus.ready()
+ await session.commit()
+ return 0, 0, None
+
page_limit_service = PageLimitService(session)
try:
estimated_pages = await _check_page_limit_or_skip(
@@ -1006,13 +1067,12 @@ async def _index_single_file(
if not content.strip():
return 0, 1, None
- existing = await check_document_by_unique_identifier(session, uid_hash)
-
if existing:
if existing.content_hash == content_hash:
mtime = full_path.stat().st_mtime
meta = dict(existing.document_metadata or {})
meta["mtime"] = mtime
+ meta["raw_file_hash"] = raw_hash
existing.document_metadata = meta
await session.commit()
return 0, 1, None
@@ -1055,6 +1115,7 @@ async def _index_single_file(
await session.refresh(db_doc)
doc_meta = dict(db_doc.document_metadata or {})
doc_meta["mtime"] = mtime
+ doc_meta["raw_file_hash"] = raw_hash
db_doc.document_metadata = doc_meta
await session.commit()
@@ -1236,6 +1297,29 @@ async def index_uploaded_files(
search_space_id,
)
+ raw_hash = await asyncio.to_thread(
+ _compute_raw_file_hash, temp_path
+ )
+
+ existing = await check_document_by_unique_identifier(
+ session, uid_hash
+ )
+
+ if existing:
+ stored_raw_hash = (existing.document_metadata or {}).get(
+ "raw_file_hash"
+ )
+ if stored_raw_hash and stored_raw_hash == raw_hash:
+ meta = dict(existing.document_metadata or {})
+ meta["mtime"] = datetime.now(UTC).timestamp()
+ existing.document_metadata = meta
+ if not DocumentStatus.is_state(
+ existing.status, DocumentStatus.READY
+ ):
+ existing.status = DocumentStatus.ready()
+ await session.commit()
+ continue
+
try:
estimated_pages = await _check_page_limit_or_skip(
page_limit_service, user_id, temp_path
@@ -1259,14 +1343,11 @@ async def index_uploaded_files(
failed_count += 1
continue
- existing = await check_document_by_unique_identifier(
- session, uid_hash
- )
-
if existing:
if existing.content_hash == content_hash:
meta = dict(existing.document_metadata or {})
meta["mtime"] = datetime.now(UTC).timestamp()
+ meta["raw_file_hash"] = raw_hash
existing.document_metadata = meta
if not DocumentStatus.is_state(
existing.status, DocumentStatus.READY
@@ -1312,6 +1393,7 @@ async def index_uploaded_files(
await session.refresh(db_doc)
doc_meta = dict(db_doc.document_metadata or {})
doc_meta["mtime"] = datetime.now(UTC).timestamp()
+ doc_meta["raw_file_hash"] = raw_hash
db_doc.document_metadata = doc_meta
await session.commit()
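
Note on PATCH 11/18: the raw-byte hash is computed with chunked reads so memory stays constant regardless of file size, and it is moved off the event loop via asyncio.to_thread because open()/read() block. A minimal standalone sketch of the same decision flow follows; stored_meta stands in for the document's persisted metadata, and should_reextract is a hypothetical name used only for illustration:

import asyncio
import hashlib


def compute_raw_file_hash(file_path: str) -> str:
    # SHA-256 over the raw bytes in 8 KiB chunks: constant memory, sequential I/O.
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()


async def should_reextract(file_path: str, stored_meta: dict) -> bool:
    # Offload the blocking read, then compare against the previously stored hash.
    raw_hash = await asyncio.to_thread(compute_raw_file_hash, file_path)
    stored = stored_meta.get("raw_file_hash")
    return not (stored and stored == raw_hash)

When this returns False the indexer can mark the document READY and skip ETL/OCR entirely; the existing content_hash check still covers the case where the raw bytes changed but the extracted text did not.
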
From ae98f647608c63455f0b5d1c6a1696187f1ed08a Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 16:48:40 +0530
Subject: [PATCH 12/18] feat: enhance folder indexing with metadata management
and improve folder structure handling in UI components
---
.../connector_indexers/local_folder_indexer.py | 18 ++++++++++++++++++
.../components/documents/FolderNode.tsx | 1 +
.../components/documents/FolderTreeView.tsx | 13 ++++++++++++-
.../layout/ui/sidebar/DocumentsSidebar.tsx | 1 +
.../components/sources/FolderWatchDialog.tsx | 2 +-
surfsense_web/zero/schema/folders.ts | 3 ++-
6 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 5c4878a04..c23fe55c3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -1276,6 +1276,13 @@ async def index_uploaded_files(
)
await session.flush()
+ root_folder = await session.get(Folder, root_folder_id)
+ if root_folder:
+ meta = dict(root_folder.folder_metadata or {})
+ meta["indexing_in_progress"] = True
+ root_folder.folder_metadata = meta
+ await session.commit()
+
page_limit_service = PageLimitService(session)
pipeline = IndexingPipelineService(session)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
@@ -1454,3 +1461,14 @@ async def index_uploaded_files(
log_entry, f"Error: {e}", "Unexpected error", {}
)
return 0, 0, str(e)
+
+ finally:
+ try:
+ root_folder = await session.get(Folder, root_folder_id)
+ if root_folder:
+ meta = dict(root_folder.folder_metadata or {})
+ meta.pop("indexing_in_progress", None)
+ root_folder.folder_metadata = meta
+ await session.commit()
+ except Exception:
+ pass
diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx
index 4b0103ce5..7f75f8abf 100644
--- a/surfsense_web/components/documents/FolderNode.tsx
+++ b/surfsense_web/components/documents/FolderNode.tsx
@@ -49,6 +49,7 @@ export interface FolderDisplay {
position: string;
parentId: number | null;
searchSpaceId: number;
+	metadata?: Record<string, unknown> | null;
}
interface FolderNodeProps {
diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx
index 65f7fd9a2..6eb53da50 100644
--- a/surfsense_web/components/documents/FolderTreeView.tsx
+++ b/surfsense_web/components/documents/FolderTreeView.tsx
@@ -168,6 +168,12 @@ export function FolderTreeView({
return states;
}, [folders, docsByFolder, foldersByParent, mentionedDocIds]);
+ const folderMap = useMemo(() => {
+		const map: Record<number, FolderDisplay> = {};
+ for (const f of folders) map[f.id] = f;
+ return map;
+ }, [folders]);
+
const folderProcessingStates = useMemo(() => {
		const states: Record<number, { hasProcessing: boolean; hasFailed: boolean }> = {};
@@ -178,6 +184,11 @@ export function FolderTreeView({
);
let hasFailed = directDocs.some((d) => d.status?.state === "failed");
+ const folder = folderMap[folderId];
+ if (folder?.metadata?.indexing_in_progress) {
+ hasProcessing = true;
+ }
+
for (const child of foldersByParent[folderId] ?? []) {
const sub = compute(child.id);
hasProcessing = hasProcessing || sub.hasProcessing;
@@ -195,7 +206,7 @@ export function FolderTreeView({
if (states[f.id] === undefined) compute(f.id);
}
return states;
- }, [folders, docsByFolder, foldersByParent]);
+ }, [folders, docsByFolder, foldersByParent, folderMap]);
function renderLevel(parentId: number | null, depth: number): React.ReactNode[] {
const key = parentId ?? "root";
diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
index b8634bb3b..7679faae5 100644
--- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
+++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx
@@ -194,6 +194,7 @@ export function DocumentsSidebar({
position: f.position,
parentId: f.parentId ?? null,
searchSpaceId: f.searchSpaceId,
+						metadata: f.metadata as Record<string, unknown> | null | undefined,
})),
[zeroFolders]
);
diff --git a/surfsense_web/components/sources/FolderWatchDialog.tsx b/surfsense_web/components/sources/FolderWatchDialog.tsx
index b44f42d77..bb3972a09 100644
--- a/surfsense_web/components/sources/FolderWatchDialog.tsx
+++ b/surfsense_web/components/sources/FolderWatchDialog.tsx
@@ -66,7 +66,7 @@ export function FolderWatchDialog({
const folderPath = await api.selectFolder();
if (!folderPath) return;
- const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath;
+ const folderName = folderPath.split(/[/\\]/).pop() || folderPath;
setSelectedFolder({ path: folderPath, name: folderName });
}, []);
diff --git a/surfsense_web/zero/schema/folders.ts b/surfsense_web/zero/schema/folders.ts
index 2313506ab..c5b192942 100644
--- a/surfsense_web/zero/schema/folders.ts
+++ b/surfsense_web/zero/schema/folders.ts
@@ -1,4 +1,4 @@
-import { number, string, table } from "@rocicorp/zero";
+import { json, number, string, table } from "@rocicorp/zero";
export const folderTable = table("folders")
.columns({
@@ -10,5 +10,6 @@ export const folderTable = table("folders")
createdById: string().optional().from("created_by_id"),
createdAt: number().from("created_at"),
updatedAt: number().from("updated_at"),
+	metadata: json<Record<string, unknown>>().optional().from("metadata"),
})
.primaryKey("id");
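
Note on PATCH 12/18: folder_metadata is copied with dict(...) and then reassigned rather than mutated in place, because a plain SQLAlchemy JSON column only registers a change when the attribute itself is reassigned (there is no MutableDict wrapper here). A condensed sketch of the set/clear lifecycle this patch introduces; run_with_indexing_flag is a hypothetical wrapper for illustration only, since the patch inlines these steps in index_uploaded_files and PATCH 16 later factors them into _set_indexing_flag/_clear_indexing_flag:

async def run_with_indexing_flag(session, root_folder_id, do_index):
    # Assumes the app's Folder model and an AsyncSession, as in the patch.
    folder = await session.get(Folder, root_folder_id)
    if folder:
        meta = dict(folder.folder_metadata or {})  # copy, don't mutate in place
        meta["indexing_in_progress"] = True
        folder.folder_metadata = meta              # reassign so the ORM sees it
        await session.commit()
    try:
        return await do_index()
    finally:
        try:
            folder = await session.get(Folder, root_folder_id)
            if folder:
                meta = dict(folder.folder_metadata or {})
                meta.pop("indexing_in_progress", None)
                folder.folder_metadata = meta
                await session.commit()
        except Exception:
            pass  # flag cleanup must never mask the original failure
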
From cab0d1bdfee27a3d2e9586cc9b97a525c8e8e20b Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:10:22 +0530
Subject: [PATCH 13/18] feat: enhance folder synchronization by integrating
subtree ID retrieval and optimizing empty folder cleanup process
---
.../app/routes/documents_routes.py | 57 +++++++------------
.../local_folder_indexer.py | 19 +++----
2 files changed, 29 insertions(+), 47 deletions(-)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 39458dc5f..e71cef7e4 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -1675,6 +1675,7 @@ async def folder_sync_finalize(
gets deleted.
"""
from app.indexing_pipeline.document_hashing import compute_identifier_hash
+ from app.services.folder_service import get_folder_subtree_ids
from app.tasks.connector_indexers.local_folder_indexer import (
_cleanup_empty_folders,
)
@@ -1687,6 +1688,11 @@ async def folder_sync_finalize(
"You don't have permission to delete documents in this search space",
)
+ if not request.root_folder_id:
+ return {"deleted_count": 0}
+
+ subtree_ids = await get_folder_subtree_ids(session, request.root_folder_id)
+
seen_hashes: set[str] = set()
for rel_path in request.all_relative_paths:
unique_id = f"{request.folder_name}:{rel_path}"
@@ -1697,32 +1703,13 @@ async def folder_sync_finalize(
)
seen_hashes.add(uid_hash)
- all_root_folder_ids: set[int] = set()
- if request.root_folder_id:
- all_root_folder_ids.add(request.root_folder_id)
-
- all_db_folders = (
- (
- await session.execute(
- select(Folder.id).where(
- Folder.search_space_id == request.search_space_id,
- )
- )
- )
- .scalars()
- .all()
- )
- all_root_folder_ids.update(all_db_folders)
-
all_folder_docs = (
(
await session.execute(
select(Document).where(
Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
Document.search_space_id == request.search_space_id,
- Document.folder_id.in_(list(all_root_folder_ids))
- if all_root_folder_ids
- else True,
+ Document.folder_id.in_(subtree_ids),
)
)
)
@@ -1738,24 +1725,22 @@ async def folder_sync_finalize(
await session.flush()
- if request.root_folder_id:
- existing_dirs: set[str] = set()
- for rel_path in request.all_relative_paths:
- parent = str(os.path.dirname(rel_path))
- if parent and parent != ".":
- existing_dirs.add(parent)
+ existing_dirs: set[str] = set()
+ for rel_path in request.all_relative_paths:
+ parent = str(os.path.dirname(rel_path))
+ if parent and parent != ".":
+ existing_dirs.add(parent)
- folder_mapping: dict[str, int] = {}
- if request.root_folder_id:
- folder_mapping[""] = request.root_folder_id
+ folder_mapping: dict[str, int] = {"": request.root_folder_id}
- await _cleanup_empty_folders(
- session,
- request.root_folder_id,
- request.search_space_id,
- existing_dirs,
- folder_mapping,
- )
+ await _cleanup_empty_folders(
+ session,
+ request.root_folder_id,
+ request.search_space_id,
+ existing_dirs,
+ folder_mapping,
+ subtree_ids=subtree_ids,
+ )
await session.commit()
return {"deleted_count": deleted_count}
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index c23fe55c3..5cac13b1b 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -387,24 +387,21 @@ async def _cleanup_empty_folders(
search_space_id: int,
existing_dirs_on_disk: set[str],
folder_mapping: dict[str, int],
+ subtree_ids: list[int] | None = None,
) -> None:
"""Delete Folder rows that are empty (no docs, no children) and no longer on disk."""
from sqlalchemy import delete as sa_delete
id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel}
- all_folders = (
- (
- await session.execute(
- select(Folder).where(
- Folder.search_space_id == search_space_id,
- Folder.id != root_folder_id,
- )
- )
- )
- .scalars()
- .all()
+ query = select(Folder).where(
+ Folder.search_space_id == search_space_id,
+ Folder.id != root_folder_id,
)
+ if subtree_ids is not None:
+ query = query.where(Folder.id.in_(subtree_ids))
+
+ all_folders = (await session.execute(query)).scalars().all()
candidates: list[Folder] = []
for folder in all_folders:
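
Note on PATCH 13/18: scoping both the document query and the empty-folder cleanup to the watched folder's subtree, instead of every folder in the search space, keeps one watched folder's sync from deleting documents that live under a different root. get_folder_subtree_ids is imported from app.services.folder_service but not shown in this series; assuming folders form a tree via parent_id, a plausible shape is an iterative breadth-first walk:

# Hypothetical sketch of get_folder_subtree_ids -- the real implementation in
# app.services.folder_service may differ (e.g. a recursive CTE in SQL).
from sqlalchemy import select


async def get_folder_subtree_ids(session, root_folder_id: int) -> list[int]:
    ids: list[int] = [root_folder_id]
    frontier: list[int] = [root_folder_id]
    while frontier:
        # One query per tree level: fetch all direct children of the frontier.
        result = await session.execute(
            select(Folder.id).where(Folder.parent_id.in_(frontier))
        )
        frontier = list(result.scalars().all())
        ids.extend(frontier)
    return ids
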
From f3aa514240335bf9f0c3009cda3e2bfc095f6c69 Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:25:18 +0530
Subject: [PATCH 14/18] feat: integrate subtree ID retrieval in local folder
cleanup process and enhance UI component styling for folder selection
---
.../app/tasks/connector_indexers/local_folder_indexer.py | 6 +++++-
surfsense_web/components/sources/FolderWatchDialog.tsx | 2 +-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 5cac13b1b..1d890c8d3 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -861,8 +861,12 @@ async def index_local_folder(
root_fid = folder_mapping.get("")
if root_fid:
+ from app.services.folder_service import get_folder_subtree_ids
+
+ subtree_ids = await get_folder_subtree_ids(session, root_fid)
await _cleanup_empty_folders(
- session, root_fid, search_space_id, existing_dirs, folder_mapping
+ session, root_fid, search_space_id, existing_dirs, folder_mapping,
+ subtree_ids=subtree_ids,
)
try:
diff --git a/surfsense_web/components/sources/FolderWatchDialog.tsx b/surfsense_web/components/sources/FolderWatchDialog.tsx
index bb3972a09..f6814bcfb 100644
--- a/surfsense_web/components/sources/FolderWatchDialog.tsx
+++ b/surfsense_web/components/sources/FolderWatchDialog.tsx
@@ -170,7 +170,7 @@ export function FolderWatchDialog({
Select a folder to sync and watch for changes.
-
+
{selectedFolder ? (
From a8b83dcf3f52346fc9297d5f834e04969216fe4a Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:48:50 +0530
Subject: [PATCH 15/18] feat: add folder_id support in ConnectorDocument and
indexing pipeline for improved document organization
---
.../indexing_pipeline/connector_document.py | 1 +
.../indexing_pipeline_service.py | 5 ++
.../local_folder_indexer.py | 59 ++++++-------------
3 files changed, 25 insertions(+), 40 deletions(-)
diff --git a/surfsense_backend/app/indexing_pipeline/connector_document.py b/surfsense_backend/app/indexing_pipeline/connector_document.py
index 019efe287..4f5d6e2e0 100644
--- a/surfsense_backend/app/indexing_pipeline/connector_document.py
+++ b/surfsense_backend/app/indexing_pipeline/connector_document.py
@@ -17,6 +17,7 @@ class ConnectorDocument(BaseModel):
metadata: dict = {}
connector_id: int | None = None
created_by_id: str
+ folder_id: int | None = None
@field_validator("title", "source_markdown", "unique_id", "created_by_id")
@classmethod
diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
index 0fa4006f5..22c552e5c 100644
--- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
+++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py
@@ -268,6 +268,8 @@ class IndexingPipelineService:
):
existing.status = DocumentStatus.pending()
existing.updated_at = datetime.now(UTC)
+ if connector_doc.folder_id is not None:
+ existing.folder_id = connector_doc.folder_id
documents.append(existing)
log_document_requeued(ctx)
continue
@@ -294,6 +296,8 @@ class IndexingPipelineService:
existing.document_metadata = connector_doc.metadata
existing.updated_at = datetime.now(UTC)
existing.status = DocumentStatus.pending()
+ if connector_doc.folder_id is not None:
+ existing.folder_id = connector_doc.folder_id
documents.append(existing)
log_document_updated(ctx)
continue
@@ -317,6 +321,7 @@ class IndexingPipelineService:
created_by_id=connector_doc.created_by_id,
updated_at=datetime.now(UTC),
status=DocumentStatus.pending(),
+ folder_id=connector_doc.folder_id,
)
self.session.add(document)
documents.append(document)
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 1d890c8d3..3360cd343 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -790,29 +790,18 @@ async def index_local_folder(
compute_unique_identifier_hash,
)
- pipeline = IndexingPipelineService(session)
- doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
- documents = await pipeline.prepare_for_indexing(connector_docs)
-
- # Assign folder_id immediately so docs appear in the correct
- # folder while still pending/processing (visible via Zero sync).
- for document in documents:
- cd = doc_map.get(document.unique_identifier_hash)
- if cd is None:
- continue
+ for cd in connector_docs:
rel_path = (cd.metadata or {}).get("file_path", "")
parent_dir = str(Path(rel_path).parent) if rel_path else ""
if parent_dir == ".":
parent_dir = ""
- document.folder_id = folder_mapping.get(
+ cd.folder_id = folder_mapping.get(
parent_dir, folder_mapping.get("")
)
- try:
- await session.commit()
- except IntegrityError:
- await session.rollback()
- for document in documents:
- await session.refresh(document)
+
+ pipeline = IndexingPipelineService(session)
+ doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
+ documents = await pipeline.prepare_for_indexing(connector_docs)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
@@ -1092,6 +1081,11 @@ async def _index_single_file(
enable_summary=enable_summary,
)
+ if root_folder_id:
+ connector_doc.folder_id = await _resolve_folder_for_file(
+ session, rel_path, root_folder_id, search_space_id, user_id
+ )
+
pipeline = IndexingPipelineService(session)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
documents = await pipeline.prepare_for_indexing([connector_doc])
@@ -1101,16 +1095,6 @@ async def _index_single_file(
db_doc = documents[0]
- if root_folder_id:
- try:
- db_doc.folder_id = await _resolve_folder_for_file(
- session, rel_path, root_folder_id, search_space_id, user_id
- )
- await session.commit()
- except IntegrityError:
- await session.rollback()
- await session.refresh(db_doc)
-
await pipeline.index(db_doc, connector_doc, llm)
await session.refresh(db_doc)
@@ -1376,6 +1360,14 @@ async def index_uploaded_files(
enable_summary=enable_summary,
)
+ connector_doc.folder_id = await _resolve_folder_for_file(
+ session,
+ relative_path,
+ root_folder_id,
+ search_space_id,
+ user_id,
+ )
+
documents = await pipeline.prepare_for_indexing([connector_doc])
if not documents:
failed_count += 1
@@ -1383,19 +1375,6 @@ async def index_uploaded_files(
db_doc = documents[0]
- try:
- db_doc.folder_id = await _resolve_folder_for_file(
- session,
- relative_path,
- root_folder_id,
- search_space_id,
- user_id,
- )
- await session.commit()
- except IntegrityError:
- await session.rollback()
- await session.refresh(db_doc)
-
await pipeline.index(db_doc, connector_doc, llm)
await session.refresh(db_doc)
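
Note on PATCH 15/18: carrying folder_id on ConnectorDocument removes the earlier two-phase pattern, where prepare_for_indexing ran first and a second commit backfilled folder_id behind an IntegrityError/rollback/refresh dance. The pipeline now persists the folder assignment in the same transaction that creates or updates the Document row, and the "if connector_doc.folder_id is not None" guard leaves an existing document's folder untouched when a connector supplies none. A condensed sketch of the new single-pass caller flow, with names as in the patch and session, llm, and the path/id variables assumed in scope:

connector_doc.folder_id = await _resolve_folder_for_file(
    session, rel_path, root_folder_id, search_space_id, user_id
)
documents = await pipeline.prepare_for_indexing([connector_doc])
db_doc = documents[0]  # folder_id is persisted with the row itself
await pipeline.index(db_doc, connector_doc, llm)
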
From 37c52ce7eaaf78dc9390d84a837fae1cdadc6bab Mon Sep 17 00:00:00 2001
From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com>
Date: Wed, 8 Apr 2026 18:01:55 +0530
Subject: [PATCH 16/18] feat: implement indexing progress management in local
folder indexing process and enhance related test coverage
---
.../local_folder_indexer.py | 113 +++++++++-------
.../test_local_folder_pipeline.py | 128 +++++++++++++++++-
.../assistant-ui/tooltip-icon-button.tsx | 5 +-
.../components/sources/FolderWatchDialog.tsx | 2 +-
4 files changed, 198 insertions(+), 50 deletions(-)
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 3360cd343..8805558bd 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -344,6 +344,27 @@ async def _resolve_folder_for_file(
return current_parent_id
+async def _set_indexing_flag(session: AsyncSession, folder_id: int) -> None:
+ folder = await session.get(Folder, folder_id)
+ if folder:
+ meta = dict(folder.folder_metadata or {})
+ meta["indexing_in_progress"] = True
+ folder.folder_metadata = meta
+ await session.commit()
+
+
+async def _clear_indexing_flag(session: AsyncSession, folder_id: int) -> None:
+ try:
+ folder = await session.get(Folder, folder_id)
+ if folder:
+ meta = dict(folder.folder_metadata or {})
+ meta.pop("indexing_in_progress", None)
+ folder.folder_metadata = meta
+ await session.commit()
+ except Exception:
+ pass
+
+
async def _cleanup_empty_folder_chain(
session: AsyncSession,
folder_id: int,
@@ -531,44 +552,50 @@ async def index_local_folder(
# BATCH MODE (1..N files)
# ====================================================================
if target_file_paths:
- if len(target_file_paths) == 1:
- indexed, skipped, err = await _index_single_file(
- session=session,
+ if root_folder_id:
+ await _set_indexing_flag(session, root_folder_id)
+ try:
+ if len(target_file_paths) == 1:
+ indexed, skipped, err = await _index_single_file(
+ session=session,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ target_file_path=target_file_paths[0],
+ enable_summary=enable_summary,
+ root_folder_id=root_folder_id,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ )
+ return indexed, skipped, root_folder_id, err
+
+ indexed, failed, err = await _index_batch_files(
search_space_id=search_space_id,
user_id=user_id,
folder_path=folder_path,
folder_name=folder_name,
- target_file_path=target_file_paths[0],
+ target_file_paths=target_file_paths,
enable_summary=enable_summary,
root_folder_id=root_folder_id,
- task_logger=task_logger,
- log_entry=log_entry,
+ on_progress_callback=on_heartbeat_callback,
)
- return indexed, skipped, root_folder_id, err
-
- indexed, failed, err = await _index_batch_files(
- search_space_id=search_space_id,
- user_id=user_id,
- folder_path=folder_path,
- folder_name=folder_name,
- target_file_paths=target_file_paths,
- enable_summary=enable_summary,
- root_folder_id=root_folder_id,
- on_progress_callback=on_heartbeat_callback,
- )
- if err:
- await task_logger.log_task_success(
- log_entry,
- f"Batch indexing: {indexed} indexed, {failed} failed",
- {"indexed": indexed, "failed": failed},
- )
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Batch indexing complete: {indexed} indexed",
- {"indexed": indexed, "failed": failed},
- )
- return indexed, failed, root_folder_id, err
+ if err:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Batch indexing: {indexed} indexed, {failed} failed",
+ {"indexed": indexed, "failed": failed},
+ )
+ else:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Batch indexing complete: {indexed} indexed",
+ {"indexed": indexed, "failed": failed},
+ )
+ return indexed, failed, root_folder_id, err
+ finally:
+ if root_folder_id:
+ await _clear_indexing_flag(session, root_folder_id)
# ====================================================================
# FULL-SCAN MODE
@@ -588,6 +615,7 @@ async def index_local_folder(
exclude_patterns=exclude_patterns,
)
await session.flush()
+ await _set_indexing_flag(session, root_folder_id)
try:
files = scan_folder(folder_path, file_extensions, exclude_patterns)
@@ -595,6 +623,7 @@ async def index_local_folder(
await task_logger.log_task_failure(
log_entry, f"Failed to scan folder: {e}", "Scan error", {}
)
+ await _clear_indexing_flag(session, root_folder_id)
return 0, 0, root_folder_id, f"Failed to scan folder: {e}"
logger.info(f"Found {len(files)} files in folder")
@@ -882,6 +911,7 @@ async def index_local_folder(
},
)
+ await _clear_indexing_flag(session, root_folder_id)
return indexed_count, skipped_count, root_folder_id, warning_message
except SQLAlchemyError as e:
@@ -890,6 +920,8 @@ async def index_local_folder(
await task_logger.log_task_failure(
log_entry, f"DB error: {e}", "Database error", {}
)
+ if root_folder_id:
+ await _clear_indexing_flag(session, root_folder_id)
return 0, 0, root_folder_id, f"Database error: {e}"
except Exception as e:
@@ -897,6 +929,8 @@ async def index_local_folder(
await task_logger.log_task_failure(
log_entry, f"Error: {e}", "Unexpected error", {}
)
+ if root_folder_id:
+ await _clear_indexing_flag(session, root_folder_id)
return 0, 0, root_folder_id, str(e)
@@ -1261,12 +1295,7 @@ async def index_uploaded_files(
)
await session.flush()
- root_folder = await session.get(Folder, root_folder_id)
- if root_folder:
- meta = dict(root_folder.folder_metadata or {})
- meta["indexing_in_progress"] = True
- root_folder.folder_metadata = meta
- await session.commit()
+ await _set_indexing_flag(session, root_folder_id)
page_limit_service = PageLimitService(session)
pipeline = IndexingPipelineService(session)
@@ -1443,12 +1472,4 @@ async def index_uploaded_files(
return 0, 0, str(e)
finally:
- try:
- root_folder = await session.get(Folder, root_folder_id)
- if root_folder:
- meta = dict(root_folder.folder_metadata or {})
- meta.pop("indexing_in_progress", None)
- root_folder.folder_metadata = meta
- await session.commit()
- except Exception:
- pass
+ await _clear_indexing_flag(session, root_folder_id)
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
index 000f43aa8..1508fb26f 100644
--- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
@@ -1,4 +1,4 @@
-"""Integration tests for local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F7), Tier 5 (P1), Tier 6 (B1-B2)."""
+"""Integration tests for local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F7), Tier 5 (P1), Tier 6 (B1-B2), Tier 7 (IP1-IP3)."""
import os
from contextlib import asynccontextmanager
@@ -1178,3 +1178,129 @@ class TestPageLimits:
await db_session.refresh(db_user)
assert db_user.pages_used > 0
assert db_user.pages_used <= db_user.pages_limit + 1
+
+
+# ====================================================================
+# Tier 7: Indexing Progress Flag (IP1-IP3)
+# ====================================================================
+
+
+class TestIndexingProgressFlag:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_ip1_full_scan_clears_flag(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """IP1: Full-scan mode clears indexing_in_progress after completion."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "note.md").write_text("# Hello\n\nContent.")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert root_folder_id is not None
+ root_folder = (
+ await db_session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one()
+ meta = root_folder.folder_metadata or {}
+ assert "indexing_in_progress" not in meta
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_ip2_single_file_clears_flag(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """IP2: Single-file (Chokidar) mode clears indexing_in_progress after completion."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "root.md").write_text("root")
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ (tmp_path / "new.md").write_text("new file content")
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(tmp_path / "new.md")],
+ root_folder_id=root_folder_id,
+ )
+
+ root_folder = (
+ await db_session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one()
+ meta = root_folder.folder_metadata or {}
+ assert "indexing_in_progress" not in meta
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_ip3_flag_set_during_indexing(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """IP3: indexing_in_progress is True on the root folder while indexing is running."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "note.md").write_text("# Check flag\n\nDuring indexing.")
+
+ from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
+
+ original_index = IndexingPipelineService.index
+ flag_observed = []
+
+ async def patched_index(self_pipe, document, connector_doc, llm):
+ folder = (
+ await db_session.execute(
+ select(Folder).where(
+ Folder.search_space_id == db_search_space.id,
+ Folder.parent_id.is_(None),
+ )
+ )
+ ).scalar_one_or_none()
+ if folder:
+ meta = folder.folder_metadata or {}
+ flag_observed.append(meta.get("indexing_in_progress", False))
+ return await original_index(self_pipe, document, connector_doc, llm)
+
+ IndexingPipelineService.index = patched_index
+ try:
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+ finally:
+ IndexingPipelineService.index = original_index
+
+ assert len(flag_observed) > 0, "index() should have been called at least once"
+ assert all(flag_observed), "indexing_in_progress should be True during indexing"
+
+ root_folder = (
+ await db_session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one()
+ meta = root_folder.folder_metadata or {}
+ assert "indexing_in_progress" not in meta
diff --git a/surfsense_web/components/assistant-ui/tooltip-icon-button.tsx b/surfsense_web/components/assistant-ui/tooltip-icon-button.tsx
index 3db00e990..f003c55c0 100644
--- a/surfsense_web/components/assistant-ui/tooltip-icon-button.tsx
+++ b/surfsense_web/components/assistant-ui/tooltip-icon-button.tsx
@@ -1,7 +1,7 @@
"use client";
import { Slottable } from "@radix-ui/react-slot";
-import { type ComponentPropsWithRef, forwardRef, type ReactNode } from "react";
+import { type ComponentPropsWithRef, forwardRef, type ReactNode, useState } from "react";
import { Button } from "@/components/ui/button";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { useMediaQuery } from "@/hooks/use-media-query";
@@ -17,9 +17,10 @@ export const TooltipIconButton = forwardRef
{
const isTouchDevice = useMediaQuery("(pointer: coarse)");
const suppressTooltip = disableTooltip || isTouchDevice;
+ const [tooltipOpen, setTooltipOpen] = useState(false);
return (
-
+