feat: enhance document upload process with two-phase indexing and real-time status updates

This commit is contained in:
Anish Sarkar 2026-02-06 05:15:47 +05:30
parent f56f5a281e
commit ed2fc5c636
3 changed files with 694 additions and 11 deletions

View file

@@ -113,9 +113,23 @@ async def create_documents_file_upload(
user: User = Depends(current_active_user),
):
"""
Upload files as documents.
Upload files as documents with real-time status tracking.
Implements 2-phase document status updates for real-time UI feedback:
- Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL)
- Phase 2: Celery processes each file: pending → processing → ready/failed
Requires DOCUMENTS_CREATE permission.
"""
from datetime import datetime
from app.db import DocumentStatus
from app.tasks.document_processors.base import (
check_document_by_unique_identifier,
get_current_timestamp,
)
from app.utils.document_converters import generate_unique_identifier_hash
try:
# Check permission
await check_permission(
@@ -129,38 +143,101 @@ async def create_documents_file_upload(
if not files:
raise HTTPException(status_code=400, detail="No files provided")
created_documents: list[Document] = []
files_to_process: list[tuple[Document, str, str]] = [] # (document, temp_path, filename)
skipped_duplicates = 0
# ===== PHASE 1: Create pending documents for all files =====
# This makes ALL documents visible in the UI immediately with pending status
for file in files:
try:
# Save file to a temporary location to avoid stream issues
import os
import tempfile
# Create temp file
# Save file to temp location
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(file.filename)[1]
delete=False, suffix=os.path.splitext(file.filename or "")[1]
) as temp_file:
temp_path = temp_file.name
# Write uploaded file to temp file
content = await file.read()
with open(temp_path, "wb") as f:
f.write(content)
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_task,
file_size = len(content)
# Generate unique identifier for deduplication check
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.FILE, file.filename or "unknown", search_space_id
)
process_file_upload_task.delay(
temp_path, file.filename, search_space_id, str(user.id)
# Check if document already exists (by unique identifier)
existing = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing:
# Clean up temp file for duplicates
os.unlink(temp_path)
skipped_duplicates += 1
continue
# Create pending document (visible immediately in UI via ElectricSQL)
document = Document(
search_space_id=search_space_id,
title=file.filename or "Uploaded File",
document_type=DocumentType.FILE,
document_metadata={
"FILE_NAME": file.filename,
"file_size": file_size,
"upload_time": datetime.now().isoformat(),
},
content="Processing...", # Placeholder until processed
content_hash=unique_identifier_hash, # Temporary, updated when ready
unique_identifier_hash=unique_identifier_hash,
embedding=None,
status=DocumentStatus.pending(), # Shows "pending" in UI
updated_at=get_current_timestamp(),
created_by_id=str(user.id),
)
session.add(document)
created_documents.append(document)
files_to_process.append((document, temp_path, file.filename or "unknown"))
except Exception as e:
raise HTTPException(
status_code=422,
detail=f"Failed to process file {file.filename}: {e!s}",
) from e
await session.commit()
return {"message": "Files uploaded for processing"}
# Commit all pending documents - they appear in UI immediately via ElectricSQL
if created_documents:
await session.commit()
# Refresh to get generated IDs
for doc in created_documents:
await session.refresh(doc)
# ===== PHASE 2: Dispatch Celery tasks for each file =====
# Each task will update document status: pending → processing → ready/failed
from app.tasks.celery_tasks.document_tasks import (
process_file_upload_with_document_task,
)
for document, temp_path, filename in files_to_process:
process_file_upload_with_document_task.delay(
document_id=document.id,
temp_path=temp_path,
filename=filename,
search_space_id=search_space_id,
user_id=str(user.id),
)
return {
"message": "Files uploaded for processing",
"document_ids": [doc.id for doc in created_documents],
"total_files": len(files),
"pending_files": len(files_to_process),
"skipped_duplicates": skipped_duplicates,
}
except HTTPException:
raise
except Exception as e: