Mirror of https://github.com/MODSetter/SurfSense.git (synced 2026-04-26 09:16:22 +02:00)
feat: enhance document upload process with two-phase indexing and real-time status updates
This commit is contained in:
Parent: f56f5a281e
Commit: ed2fc5c636
3 changed files with 694 additions and 11 deletions
|
|
@ -113,9 +113,23 @@ async def create_documents_file_upload(
|
|||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Upload files as documents.
|
||||
Upload files as documents with real-time status tracking.
|
||||
|
||||
Implements 2-phase document status updates for real-time UI feedback:
|
||||
- Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL)
|
||||
- Phase 2: Celery processes each file: pending → processing → ready/failed
|
||||
|
||||
Requires DOCUMENTS_CREATE permission.
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
from app.db import DocumentStatus
|
||||
from app.tasks.document_processors.base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
)
|
||||
from app.utils.document_converters import generate_unique_identifier_hash
|
||||
|
||||
try:
|
||||
# Check permission
|
||||
await check_permission(
|
||||
|
|
@ -129,38 +143,101 @@ async def create_documents_file_upload(
|
|||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
created_documents: list[Document] = []
|
||||
files_to_process: list[tuple[Document, str, str]] = [] # (document, temp_path, filename)
|
||||
skipped_duplicates = 0
|
||||
|
||||
# ===== PHASE 1: Create pending documents for all files =====
|
||||
# This makes ALL documents visible in the UI immediately with pending status
|
||||
for file in files:
|
||||
try:
|
||||
# Save file to a temporary location to avoid stream issues
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
# Create temp file
|
||||
# Save file to temp location
|
||||
with tempfile.NamedTemporaryFile(
|
||||
delete=False, suffix=os.path.splitext(file.filename)[1]
|
||||
delete=False, suffix=os.path.splitext(file.filename or "")[1]
|
||||
) as temp_file:
|
||||
temp_path = temp_file.name
|
||||
|
||||
# Write uploaded file to temp file
|
||||
content = await file.read()
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
from app.tasks.celery_tasks.document_tasks import (
|
||||
process_file_upload_task,
|
||||
file_size = len(content)
|
||||
|
||||
# Generate unique identifier for deduplication check
|
||||
unique_identifier_hash = generate_unique_identifier_hash(
|
||||
DocumentType.FILE, file.filename or "unknown", search_space_id
|
||||
)
|
||||
|
||||
process_file_upload_task.delay(
|
||||
temp_path, file.filename, search_space_id, str(user.id)
|
||||
# Check if document already exists (by unique identifier)
|
||||
existing = await check_document_by_unique_identifier(
|
||||
session, unique_identifier_hash
|
||||
)
|
||||
if existing:
|
||||
# Clean up temp file for duplicates
|
||||
os.unlink(temp_path)
|
||||
skipped_duplicates += 1
|
||||
continue
|
||||
|
||||
# Create pending document (visible immediately in UI via ElectricSQL)
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file.filename or "Uploaded File",
|
||||
document_type=DocumentType.FILE,
|
||||
document_metadata={
|
||||
"FILE_NAME": file.filename,
|
||||
"file_size": file_size,
|
||||
"upload_time": datetime.now().isoformat(),
|
||||
},
|
||||
content="Processing...", # Placeholder until processed
|
||||
content_hash=unique_identifier_hash, # Temporary, updated when ready
|
||||
unique_identifier_hash=unique_identifier_hash,
|
||||
embedding=None,
|
||||
status=DocumentStatus.pending(), # Shows "pending" in UI
|
||||
updated_at=get_current_timestamp(),
|
||||
created_by_id=str(user.id),
|
||||
)
|
||||
session.add(document)
|
||||
created_documents.append(document)
|
||||
files_to_process.append((document, temp_path, file.filename or "unknown"))
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=f"Failed to process file {file.filename}: {e!s}",
|
||||
) from e
|
||||
|
||||
await session.commit()
|
||||
return {"message": "Files uploaded for processing"}
|
||||
# Commit all pending documents - they appear in UI immediately via ElectricSQL
|
||||
if created_documents:
|
||||
await session.commit()
|
||||
# Refresh to get generated IDs
|
||||
for doc in created_documents:
|
||||
await session.refresh(doc)
|
||||
|
||||
# ===== PHASE 2: Dispatch Celery tasks for each file =====
|
||||
# Each task will update document status: pending → processing → ready/failed
|
||||
from app.tasks.celery_tasks.document_tasks import (
|
||||
process_file_upload_with_document_task,
|
||||
)
|
||||
|
||||
for document, temp_path, filename in files_to_process:
|
||||
process_file_upload_with_document_task.delay(
|
||||
document_id=document.id,
|
||||
temp_path=temp_path,
|
||||
filename=filename,
|
||||
search_space_id=search_space_id,
|
||||
user_id=str(user.id),
|
||||
)
|
||||
|
||||
return {
|
||||
"message": "Files uploaded for processing",
|
||||
"document_ids": [doc.id for doc in created_documents],
|
||||
"total_files": len(files),
|
||||
"pending_files": len(files_to_process),
|
||||
"skipped_duplicates": skipped_duplicates,
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue