feat: simplified document upload handling

- Introduced a new endpoint for batch document status retrieval, allowing users to check the status of multiple documents in a search space.
- Enhanced the document upload process to return duplicate document IDs and improved the response structure.
- Updated schemas to include new response models for document status.
- Removed unused attachment processing code from chat routes and UI components to streamline functionality.
2026-04-30 03:16:25 +02:00 · 2026-02-09 16:46:54 -08:00 · 2026-02-09 16:46:54 -08:00 · c979609041
commit c979609041
parent d11e76aaa1
15 changed files with 475 additions and 1090 deletions
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@ -18,6 +18,8 @@ from app.db import (
 )
 from app.schemas import (
    DocumentRead,
+    DocumentStatusBatchResponse,
+    DocumentStatusItemRead,
    DocumentsCreate,
    DocumentStatusSchema,
    DocumentTitleRead,
@ -148,6 +150,7 @@ async def create_documents_file_upload(
            tuple[Document, str, str]
        ] = []  # (document, temp_path, filename)
        skipped_duplicates = 0
+        duplicate_document_ids: list[int] = []

        # ===== PHASE 1: Create pending documents for all files =====
        # This makes ALL documents visible in the UI immediately with pending status
@ -182,6 +185,7 @@ async def create_documents_file_upload(
                        # True duplicate — content already indexed, skip
                        os.unlink(temp_path)
                        skipped_duplicates += 1
+                        duplicate_document_ids.append(existing.id)
                        continue

                    # Existing document is stuck (failed/pending/processing)
@ -255,6 +259,7 @@ async def create_documents_file_upload(
        return {
            "message": "Files uploaded for processing",
            "document_ids": [doc.id for doc in created_documents],
+            "duplicate_document_ids": duplicate_document_ids,
            "total_files": len(files),
            "pending_files": len(files_to_process),
            "skipped_duplicates": skipped_duplicates,
@ -678,6 +683,74 @@ async def search_document_titles(
        ) from e


+@router.get("/documents/status", response_model=DocumentStatusBatchResponse)
+async def get_documents_status(
+    search_space_id: int,
+    document_ids: str,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Report the processing status of several documents in one request.
+
+    Lightweight lookup meant for polling async ETL progress in chat upload
+    flows. ``document_ids`` is a comma-separated list such as ``"1,2,3"``;
+    blank entries are skipped and a non-integer entry yields a 400. Only
+    documents in ``search_space_id`` are returned, and a document with no
+    stored status state is reported as ``"ready"``.
+    """
+    try:
+        await check_permission(
+            session,
+            user,
+            search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        wanted_ids = []
+        for token in (piece.strip() for piece in document_ids.split(",")):
+            if token:
+                try:
+                    wanted_ids.append(int(token))
+                except ValueError:
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Invalid document id: {token}",
+                    ) from None
+
+        # Nothing to look up, so skip the database round trip.
+        if not wanted_ids:
+            return DocumentStatusBatchResponse(items=[])
+
+        query = select(Document).filter(
+            Document.search_space_id == search_space_id,
+            Document.id.in_(wanted_ids),
+        )
+        rows = (await session.execute(query)).scalars().all()
+        status_items = []
+        for row in rows:
+            stored = row.status or {}
+            status_items.append(
+                DocumentStatusItemRead(
+                    id=row.id,
+                    title=row.title,
+                    document_type=row.document_type,
+                    status=DocumentStatusSchema(
+                        state=stored.get("state", "ready"),
+                        reason=stored.get("reason"),
+                    ),
+                )
+            )
+        return DocumentStatusBatchResponse(items=status_items)
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch document status: {e!s}"
+        ) from e
+
+
@router.get("/documents/type-counts")
 async def get_document_type_counts(
    search_space_id: int | None = None,