feat: enhance document upload process with two-phase indexing and real-time status updates

This commit is contained in:
Anish Sarkar 2026-02-06 05:15:47 +05:30
parent f56f5a281e
commit ed2fc5c636
3 changed files with 694 additions and 11 deletions

View file

@@ -113,9 +113,23 @@ async def create_documents_file_upload(
user: User = Depends(current_active_user),
):
"""
Upload files as documents.
Upload files as documents with real-time status tracking.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL)
- Phase 2: Celery processes each file: pending → processing → ready/failed
Requires DOCUMENTS_CREATE permission.
"""
from datetime import datetime
from app.db import DocumentStatus
from app.tasks.document_processors.base import (
check_document_by_unique_identifier,
get_current_timestamp,
)
from app.utils.document_converters import generate_unique_identifier_hash
try:
# Check permission
await check_permission(
@@ -129,38 +143,101 @@ async def create_documents_file_upload(
if not files:
raise HTTPException(status_code=400, detail="No files provided")
created_documents: list[Document] = []
files_to_process: list[tuple[Document, str, str]] = [] # (document, temp_path, filename)
skipped_duplicates = 0
# ===== PHASE 1: Create pending documents for all files =====
# This makes ALL documents visible in the UI immediately with pending status
for file in files:
try:
# Save file to a temporary location to avoid stream issues
import os
import tempfile
# Create temp file
# Save file to temp location
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(file.filename)[1]
delete=False, suffix=os.path.splitext(file.filename or "")[1]
) as temp_file:
temp_path = temp_file.name
# Write uploaded file to temp file
content = await file.read()
with open(temp_path, "wb") as f:
f.write(content)
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_task,
file_size = len(content)
# Generate unique identifier for deduplication check
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.FILE, file.filename or "unknown", search_space_id
)
process_file_upload_task.delay(
temp_path, file.filename, search_space_id, str(user.id)
# Check if document already exists (by unique identifier)
existing = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing:
# Clean up temp file for duplicates
os.unlink(temp_path)
skipped_duplicates += 1
continue
# Create pending document (visible immediately in UI via ElectricSQL)
document = Document(
search_space_id=search_space_id,
title=file.filename or "Uploaded File",
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file.filename,
"file_size": file_size,
"upload_time": datetime.now().isoformat(),
},
content="Processing...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary, updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
status=DocumentStatus.pending(), # Shows "pending" in UI
updated_at=get_current_timestamp(),
created_by_id=str(user.id),
)
session.add(document)
created_documents.append(document)
files_to_process.append((document, temp_path, file.filename or "unknown"))
except Exception as e:
raise HTTPException(
status_code=422,
detail=f"Failed to process file {file.filename}: {e!s}",
) from e
await session.commit()
return {"message": "Files uploaded for processing"}
# Commit all pending documents - they appear in UI immediately via ElectricSQL
if created_documents:
await session.commit()
# Refresh to get generated IDs
for doc in created_documents:
await session.refresh(doc)
# ===== PHASE 2: Dispatch Celery tasks for each file =====
# Each task will update document status: pending → processing → ready/failed
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_with_document_task,
)
for document, temp_path, filename in files_to_process:
process_file_upload_with_document_task.delay(
document_id=document.id,
temp_path=temp_path,
filename=filename,
search_space_id=search_space_id,
user_id=str(user.id),
)
return {
"message": "Files uploaded for processing",
"document_ids": [doc.id for doc in created_documents],
"total_files": len(files),
"pending_files": len(files_to_process),
"skipped_duplicates": skipped_duplicates,
}
except HTTPException:
raise
except Exception as e: