Mirror of https://github.com/MODSetter/SurfSense.git, synced 2026-04-27 09:46:25 +02:00

feat: implement upload-based folder indexing and synchronization features

parent b3925654dd, commit 5f5954e932
13 changed files with 1273 additions and 45 deletions
@@ -1543,3 +1543,379 @@ async def folder_index_files(
        "status": "processing",
        "file_count": len(request.target_file_paths),
    }


# ===== Upload-based local folder indexing endpoints =====
# These work for ALL deployment modes (cloud, self-hosted remote, self-hosted local).
# The desktop app reads files locally and uploads them here.
class FolderMtimeCheckFile(PydanticBaseModel):
    relative_path: str
    mtime: float


class FolderMtimeCheckRequest(PydanticBaseModel):
    folder_name: str
    search_space_id: int
    files: list[FolderMtimeCheckFile]


class FolderUnlinkRequest(PydanticBaseModel):
    folder_name: str
    search_space_id: int
    root_folder_id: int | None = None
    relative_paths: list[str]


class FolderSyncFinalizeRequest(PydanticBaseModel):
    folder_name: str
    search_space_id: int
    root_folder_id: int | None = None
    all_relative_paths: list[str]
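These request models map to plain JSON bodies. For orientation, a minimal FolderMtimeCheckRequest payload might look like this (the folder name, paths, and timestamps are invented for the example):

    # Illustrative JSON body for POST /documents/folder-mtime-check;
    # field names come from the models above, values are made up.
    payload = {
        "folder_name": "MyNotes",
        "search_space_id": 1,
        "files": [
            {"relative_path": "daily/2024-01-01.md", "mtime": 1704067200.0},
            {"relative_path": "projects/plan.md", "mtime": 1704153600.0},
        ],
    }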
@router.post("/documents/folder-mtime-check")
|
||||
async def folder_mtime_check(
|
||||
request: FolderMtimeCheckRequest,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""Pre-upload optimization: check which files need uploading based on mtime.
|
||||
|
||||
Returns the subset of relative paths where the file is new or has a
|
||||
different mtime, so the client can skip reading/uploading unchanged files.
|
||||
"""
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
request.search_space_id,
|
||||
Permission.DOCUMENTS_CREATE.value,
|
||||
"You don't have permission to create documents in this search space",
|
||||
)
|
||||
|
||||
uid_hashes = {}
|
||||
for f in request.files:
|
||||
uid = f"{request.folder_name}:{f.relative_path}"
|
||||
uid_hash = compute_identifier_hash(
|
||||
DocumentType.LOCAL_FOLDER_FILE.value, uid, request.search_space_id
|
||||
)
|
||||
uid_hashes[uid_hash] = f
|
||||
|
||||
existing_docs = (
|
||||
(
|
||||
await session.execute(
|
||||
select(Document).where(
|
||||
Document.unique_identifier_hash.in_(list(uid_hashes.keys())),
|
||||
Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
|
||||
)
|
||||
)
|
||||
)
|
||||
.scalars()
|
||||
.all()
|
||||
)
|
||||
|
||||
existing_by_hash = {doc.unique_identifier_hash: doc for doc in existing_docs}
|
||||
|
||||
MTIME_TOLERANCE = 1.0
|
||||
files_to_upload: list[str] = []
|
||||
|
||||
for uid_hash, file_info in uid_hashes.items():
|
||||
doc = existing_by_hash.get(uid_hash)
|
||||
if doc is None:
|
||||
files_to_upload.append(file_info.relative_path)
|
||||
continue
|
||||
|
||||
stored_mtime = (doc.document_metadata or {}).get("mtime")
|
||||
if stored_mtime is None:
|
||||
files_to_upload.append(file_info.relative_path)
|
||||
continue
|
||||
|
||||
if abs(file_info.mtime - stored_mtime) >= MTIME_TOLERANCE:
|
||||
files_to_upload.append(file_info.relative_path)
|
||||
|
||||
return {"files_to_upload": files_to_upload}
|
||||
|
||||
|
||||
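A minimal sketch of the client side of this check, assuming an httpx-based desktop client; the base URL, auth header, and helper names are this example's inventions, not part of the commit:

    import os

    import httpx  # assumed HTTP client; any client that speaks JSON works

    API = "http://localhost:8000/api/v1"  # hypothetical base URL

    def list_relative_paths(folder: str) -> list[str]:
        # Walk the watched folder and normalize separators to "/".
        paths = []
        for root, _dirs, names in os.walk(folder):
            for name in names:
                rel = os.path.relpath(os.path.join(root, name), folder)
                paths.append(rel.replace(os.sep, "/"))
        return paths

    def changed_files(folder: str, folder_name: str, search_space_id: int, token: str) -> list[str]:
        # Send every (path, mtime) pair; the server answers with the subset
        # that is new or stale, so unchanged files are never read or uploaded.
        files = [
            {
                "relative_path": rel,
                "mtime": os.path.getmtime(os.path.join(folder, rel)),
            }
            for rel in list_relative_paths(folder)
        ]
        resp = httpx.post(
            f"{API}/documents/folder-mtime-check",
            json={
                "folder_name": folder_name,
                "search_space_id": search_space_id,
                "files": files,
            },
            headers={"Authorization": f"Bearer {token}"},
        )
        resp.raise_for_status()
        return resp.json()["files_to_upload"]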
@router.post("/documents/folder-upload")
|
||||
async def folder_upload(
|
||||
files: list[UploadFile],
|
||||
folder_name: str = Form(...),
|
||||
search_space_id: int = Form(...),
|
||||
relative_paths: str = Form(...),
|
||||
root_folder_id: int | None = Form(None),
|
||||
enable_summary: bool = Form(False),
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""Upload files from the desktop app for folder indexing.
|
||||
|
||||
Files are written to temp storage and dispatched to a Celery task.
|
||||
Works for all deployment modes (no is_self_hosted guard).
|
||||
"""
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
search_space_id,
|
||||
Permission.DOCUMENTS_CREATE.value,
|
||||
"You don't have permission to create documents in this search space",
|
||||
)
|
||||
|
||||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
try:
|
||||
rel_paths: list[str] = json.loads(relative_paths)
|
||||
except (json.JSONDecodeError, TypeError) as e:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Invalid relative_paths JSON: {e}"
|
||||
) from e
|
||||
|
||||
if len(rel_paths) != len(files):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Mismatch: {len(files)} files but {len(rel_paths)} relative_paths",
|
||||
)
|
||||
|
||||
for file in files:
|
||||
file_size = file.size or 0
|
||||
if file_size > MAX_FILE_SIZE_BYTES:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
|
||||
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
||||
)
|
||||
|
||||
if not root_folder_id:
|
||||
watched_metadata = {
|
||||
"watched": True,
|
||||
"folder_path": folder_name,
|
||||
}
|
||||
existing_root = (
|
||||
await session.execute(
|
||||
select(Folder).where(
|
||||
Folder.name == folder_name,
|
||||
Folder.parent_id.is_(None),
|
||||
Folder.search_space_id == search_space_id,
|
||||
)
|
||||
)
|
||||
).scalar_one_or_none()
|
||||
|
||||
if existing_root:
|
||||
root_folder_id = existing_root.id
|
||||
existing_root.folder_metadata = watched_metadata
|
||||
else:
|
||||
root_folder = Folder(
|
||||
name=folder_name,
|
||||
search_space_id=search_space_id,
|
||||
created_by_id=str(user.id),
|
||||
position="a0",
|
||||
folder_metadata=watched_metadata,
|
||||
)
|
||||
session.add(root_folder)
|
||||
await session.flush()
|
||||
root_folder_id = root_folder.id
|
||||
|
||||
await session.commit()
|
||||
|
||||
async def _read_and_save(file: UploadFile, idx: int) -> dict:
|
||||
content = await file.read()
|
||||
filename = file.filename or rel_paths[idx].split("/")[-1]
|
||||
|
||||
def _write_temp() -> str:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
delete=False, suffix=os.path.splitext(filename)[1]
|
||||
) as tmp:
|
||||
tmp.write(content)
|
||||
return tmp.name
|
||||
|
||||
temp_path = await asyncio.to_thread(_write_temp)
|
||||
return {
|
||||
"temp_path": temp_path,
|
||||
"relative_path": rel_paths[idx],
|
||||
"filename": filename,
|
||||
}
|
||||
|
||||
file_mappings = await asyncio.gather(
|
||||
*(_read_and_save(f, i) for i, f in enumerate(files))
|
||||
)
|
||||
|
||||
from app.tasks.celery_tasks.document_tasks import (
|
||||
index_uploaded_folder_files_task,
|
||||
)
|
||||
|
||||
index_uploaded_folder_files_task.delay(
|
||||
search_space_id=search_space_id,
|
||||
user_id=str(user.id),
|
||||
folder_name=folder_name,
|
||||
root_folder_id=root_folder_id,
|
||||
enable_summary=enable_summary,
|
||||
file_mappings=list(file_mappings),
|
||||
)
|
||||
|
||||
return {
|
||||
"message": f"Folder upload started for {len(files)} file(s)",
|
||||
"status": "processing",
|
||||
"root_folder_id": root_folder_id,
|
||||
"file_count": len(files),
|
||||
}
|
||||
|
||||
|
||||
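The matching upload call then sends only that subset. Two details are worth showing: the multipart files must line up one-to-one with relative_paths, and relative_paths itself travels as a JSON-encoded form field. A sketch reusing API from the previous example (still assumptions, not the commit's client code):

    import json
    import os

    import httpx

    def upload_files(folder: str, folder_name: str, search_space_id: int,
                     rel_paths: list[str], token: str) -> dict:
        # Repeat the "files" field name so FastAPI collects the parts into
        # list[UploadFile]; the order must match the rel_paths list.
        parts = [
            ("files", (os.path.basename(rel), open(os.path.join(folder, rel), "rb")))
            for rel in rel_paths
        ]
        try:
            resp = httpx.post(
                f"{API}/documents/folder-upload",
                data={
                    "folder_name": folder_name,
                    "search_space_id": search_space_id,
                    "relative_paths": json.dumps(rel_paths),
                },
                files=parts,
                headers={"Authorization": f"Bearer {token}"},
            )
            resp.raise_for_status()
            return resp.json()  # includes root_folder_id for later calls
        finally:
            for _field, (_name, fh) in parts:
                fh.close()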
@router.post("/documents/folder-unlink")
|
||||
async def folder_unlink(
|
||||
request: FolderUnlinkRequest,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""Handle file deletion events from the desktop watcher.
|
||||
|
||||
For each relative path, find the matching document and delete it.
|
||||
"""
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
from app.tasks.connector_indexers.local_folder_indexer import (
|
||||
_cleanup_empty_folder_chain,
|
||||
)
|
||||
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
request.search_space_id,
|
||||
Permission.DOCUMENTS_DELETE.value,
|
||||
"You don't have permission to delete documents in this search space",
|
||||
)
|
||||
|
||||
deleted_count = 0
|
||||
|
||||
for rel_path in request.relative_paths:
|
||||
unique_id = f"{request.folder_name}:{rel_path}"
|
||||
uid_hash = compute_identifier_hash(
|
||||
DocumentType.LOCAL_FOLDER_FILE.value,
|
||||
unique_id,
|
||||
request.search_space_id,
|
||||
)
|
||||
|
||||
existing = (
|
||||
await session.execute(
|
||||
select(Document).where(
|
||||
Document.unique_identifier_hash == uid_hash
|
||||
)
|
||||
)
|
||||
).scalar_one_or_none()
|
||||
|
||||
if existing:
|
||||
deleted_folder_id = existing.folder_id
|
||||
await session.delete(existing)
|
||||
await session.flush()
|
||||
|
||||
if deleted_folder_id and request.root_folder_id:
|
||||
await _cleanup_empty_folder_chain(
|
||||
session, deleted_folder_id, request.root_folder_id
|
||||
)
|
||||
deleted_count += 1
|
||||
|
||||
await session.commit()
|
||||
return {"deleted_count": deleted_count}
|
||||
|
||||
|
||||
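On the client, a burst of filesystem delete events from the watcher maps to a single unlink call; a sketch under the same assumptions (httpx, API, and the auth token) as above:

    def unlink_files(folder_name: str, search_space_id: int,
                     root_folder_id: int | None, rel_paths: list[str],
                     token: str) -> int:
        # Report deleted paths; the server removes the matching documents
        # and prunes any folder chain left empty by the deletion.
        resp = httpx.post(
            f"{API}/documents/folder-unlink",
            json={
                "folder_name": folder_name,
                "search_space_id": search_space_id,
                "root_folder_id": root_folder_id,
                "relative_paths": rel_paths,
            },
            headers={"Authorization": f"Bearer {token}"},
        )
        resp.raise_for_status()
        return resp.json()["deleted_count"]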
@router.post("/documents/folder-sync-finalize")
|
||||
async def folder_sync_finalize(
|
||||
request: FolderSyncFinalizeRequest,
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""Finalize a full folder scan by deleting orphaned documents.
|
||||
|
||||
The client sends the complete list of relative paths currently in the
|
||||
folder. Any document in the DB for this folder that is NOT in the list
|
||||
gets deleted.
|
||||
"""
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
from app.tasks.connector_indexers.local_folder_indexer import (
|
||||
_cleanup_empty_folders,
|
||||
)
|
||||
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
request.search_space_id,
|
||||
Permission.DOCUMENTS_DELETE.value,
|
||||
"You don't have permission to delete documents in this search space",
|
||||
)
|
||||
|
||||
seen_hashes: set[str] = set()
|
||||
for rel_path in request.all_relative_paths:
|
||||
unique_id = f"{request.folder_name}:{rel_path}"
|
||||
uid_hash = compute_identifier_hash(
|
||||
DocumentType.LOCAL_FOLDER_FILE.value,
|
||||
unique_id,
|
||||
request.search_space_id,
|
||||
)
|
||||
seen_hashes.add(uid_hash)
|
||||
|
||||
all_root_folder_ids: set[int] = set()
|
||||
if request.root_folder_id:
|
||||
all_root_folder_ids.add(request.root_folder_id)
|
||||
|
||||
all_db_folders = (
|
||||
(
|
||||
await session.execute(
|
||||
select(Folder.id).where(
|
||||
Folder.search_space_id == request.search_space_id,
|
||||
)
|
||||
)
|
||||
)
|
||||
.scalars()
|
||||
.all()
|
||||
)
|
||||
all_root_folder_ids.update(all_db_folders)
|
||||
|
||||
all_folder_docs = (
|
||||
(
|
||||
await session.execute(
|
||||
select(Document).where(
|
||||
Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
|
||||
Document.search_space_id == request.search_space_id,
|
||||
Document.folder_id.in_(list(all_root_folder_ids))
|
||||
if all_root_folder_ids
|
||||
else True,
|
||||
)
|
||||
)
|
||||
)
|
||||
.scalars()
|
||||
.all()
|
||||
)
|
||||
|
||||
deleted_count = 0
|
||||
for doc in all_folder_docs:
|
||||
if doc.unique_identifier_hash not in seen_hashes:
|
||||
await session.delete(doc)
|
||||
deleted_count += 1
|
||||
|
||||
await session.flush()
|
||||
|
||||
if request.root_folder_id:
|
||||
existing_dirs: set[str] = set()
|
||||
for rel_path in request.all_relative_paths:
|
||||
parent = str(os.path.dirname(rel_path))
|
||||
if parent and parent != ".":
|
||||
existing_dirs.add(parent)
|
||||
|
||||
folder_mapping: dict[str, int] = {}
|
||||
if request.root_folder_id:
|
||||
folder_mapping[""] = request.root_folder_id
|
||||
|
||||
await _cleanup_empty_folders(
|
||||
session,
|
||||
request.root_folder_id,
|
||||
request.search_space_id,
|
||||
existing_dirs,
|
||||
folder_mapping,
|
||||
)
|
||||
|
||||
await session.commit()
|
||||
return {"deleted_count": deleted_count}
|
||||
|
|
|
|||
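Putting the endpoints together, a full desktop-side scan would check mtimes, upload only the changed subset, then finalize so documents whose files vanished on disk are pruned. A sketch built from the hypothetical helpers in the earlier examples:

    def sync_folder(folder: str, folder_name: str, search_space_id: int, token: str) -> None:
        all_paths = list_relative_paths(folder)

        # 1. Ask the server which files are new or stale (mtime check).
        to_upload = changed_files(folder, folder_name, search_space_id, token)

        # 2. Upload only that subset; the server ensures the watched root
        #    folder exists and dispatches the Celery indexing task.
        root_folder_id = None
        if to_upload:
            result = upload_files(folder, folder_name, search_space_id, to_upload, token)
            root_folder_id = result["root_folder_id"]

        # 3. Send the complete current listing so server-side orphans are
        #    deleted and newly empty folders cleaned up.
        resp = httpx.post(
            f"{API}/documents/folder-sync-finalize",
            json={
                "folder_name": folder_name,
                "search_space_id": search_space_id,
                "root_folder_id": root_folder_id,
                "all_relative_paths": all_paths,
            },
            headers={"Authorization": f"Bearer {token}"},
        )
        resp.raise_for_status()

Note that, per the handler above, calling folder-sync-finalize without a root_folder_id still deletes orphaned documents but skips the empty-folder cleanup, which only runs when a root folder is identified.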