feat: add document status management with JSONB column for processing states in documents

2026-06-08 20:25:19 +02:00 · 2026-02-05 21:59:31 +05:30 · 2026-02-05 21:59:31 +05:30 · aef59d04eb
commit aef59d04eb
parent 04884caeef
13 changed files with 526 additions and 135 deletions
--- a/surfsense_backend/alembic/versions/92_add_document_status_column.py
+++ b/surfsense_backend/alembic/versions/92_add_document_status_column.py
@ -0,0 +1,80 @@
+"""Add status column to documents table for per-document processing status
+
+Revision ID: 92
+Revises: 91
+Create Date: 2026-02-05
+
+Changes:
+1. Add status column (JSONB) to documents table
+2. Default value is {"state": "ready"} for backward compatibility
+3. Existing documents are set to ready status
+4. Index created for efficient status filtering
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "92"
+down_revision: str | None = "91"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Add status column to documents with default ready state."""
+
+    # 1. Add status column; the NOT NULL default applies to new rows and backfills existing rows as ready
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF NOT EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'status'
+            ) THEN
+                ALTER TABLE documents
+                ADD COLUMN status JSONB NOT NULL DEFAULT '{"state": "ready"}'::jsonb;
+            END IF;
+        END$$;
+        """
+    )
+
+    # 2. Create index on status for efficient filtering by state
+    op.execute(
+        """
+        CREATE INDEX IF NOT EXISTS ix_documents_status
+        ON documents ((status->>'state'));
+        """
+    )
+
+
+def downgrade() -> None:
+    """Remove status column from documents."""
+
+    # Drop index
+    op.execute(
+        """
+        DROP INDEX IF EXISTS ix_documents_status;
+        """
+    )
+
+    # Drop column
+    op.execute(
+        """
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT 1 FROM information_schema.columns
+                WHERE table_name = 'documents' AND column_name = 'status'
+            ) THEN
+                ALTER TABLE documents
+                DROP COLUMN status;
+            END IF;
+        END$$;
+        """
+    )
+
--- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py
+++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py
@ -16,13 +16,14 @@ from sqlalchemy.orm import selectinload

 from app.config import config
 from app.connectors.composio_connector import ComposioConnector
-from app.db import Document, DocumentType
+from app.db import Document, DocumentStatus, DocumentType
 from app.services.composio_service import TOOLKIT_TO_DOCUMENT_TYPE
 from app.services.llm_service import get_user_long_context_llm
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.connector_indexers.base import (
    calculate_date_range,
    check_duplicate_document_by_hash,
+    safe_set_chunks,
 )
 from app.utils.document_converters import (
    create_document_chunks,
@ -266,18 +267,18 @@ async def index_composio_google_calendar(

        documents_indexed = 0
        documents_skipped = 0
-        duplicate_content_count = (
-            0  # Track events skipped due to duplicate content_hash
-        )
+        documents_failed = 0  # Track events that failed processing
+        duplicate_content_count = 0  # Track events skipped due to duplicate content_hash
        last_heartbeat_time = time.time()

+        # =======================================================================
+        # PHASE 1: Analyze all events, create pending documents
+        # This makes ALL documents visible in the UI immediately with pending status
+        # =======================================================================
+        events_to_process = []  # List of dicts with document and event data
+        new_documents_created = False
+
        for event in events:
-            # Send heartbeat periodically to indicate task is still alive
-            if on_heartbeat_callback:
-                current_time = time.time()
-                if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
-                    await on_heartbeat_callback(documents_indexed)
-                    last_heartbeat_time = current_time
            try:
                # Handle both standard Google API and potential Composio variations
                event_id = event.get("id", "") or event.get("eventId", "")
@ -315,61 +316,24 @@ async def index_composio_google_calendar(

                if existing_document:
                    if existing_document.content_hash == content_hash:
+                        # Ensure status is ready (might have been stuck in processing/pending)
+                        if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY):
+                            existing_document.status = DocumentStatus.ready()
                        documents_skipped += 1
                        continue

-                    # Update existing
-                    user_llm = await get_user_long_context_llm(
-                        session, user_id, search_space_id
-                    )
-
-                    if user_llm:
-                        document_metadata = {
-                            "event_id": event_id,
-                            "summary": summary,
-                            "start_time": start_time,
-                            "document_type": "Google Calendar Event (Composio)",
-                        }
-                        (
-                            summary_content,
-                            summary_embedding,
-                        ) = await generate_document_summary(
-                            markdown_content, user_llm, document_metadata
-                        )
-                    else:
-                        summary_content = f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
-                        if location:
-                            summary_content += f"\nLocation: {location}"
-                        summary_embedding = config.embedding_model_instance.embed(
-                            summary_content
-                        )
-
-                    chunks = await create_document_chunks(markdown_content)
-
-                    existing_document.title = summary
-                    existing_document.content = summary_content
-                    existing_document.content_hash = content_hash
-                    existing_document.embedding = summary_embedding
-                    existing_document.document_metadata = {
-                        "event_id": event_id,
-                        "summary": summary,
-                        "start_time": start_time,
-                        "end_time": end_time,
-                        "location": location,
-                        "connector_id": connector_id,
-                        "source": "composio",
-                    }
-                    existing_document.chunks = chunks
-                    existing_document.updated_at = get_current_timestamp()
-
-                    documents_indexed += 1
-
-                    # Batch commit every 10 documents
-                    if documents_indexed % 10 == 0:
-                        logger.info(
-                            f"Committing batch: {documents_indexed} Google Calendar events processed so far"
-                        )
-                        await session.commit()
+                    # Queue existing document for update (will be set to processing in Phase 2)
+                    events_to_process.append({
+                        'document': existing_document,
+                        'is_new': False,
+                        'markdown_content': markdown_content,
+                        'content_hash': content_hash,
+                        'event_id': event_id,
+                        'summary': summary,
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'location': location,
+                    })
                    continue

                # Document doesn't exist by unique_identifier_hash
@ -380,46 +344,16 @@ async def index_composio_google_calendar(
                    )

                if duplicate_by_content:
-                    # A document with the same content already exists (likely from standard connector)
                    logger.info(
                        f"Event {summary} already indexed by another connector "
                        f"(existing document ID: {duplicate_by_content.id}, "
-                        f"type: {duplicate_by_content.document_type}). Skipping to avoid duplicate content."
+                        f"type: {duplicate_by_content.document_type}). Skipping."
                    )
                    duplicate_content_count += 1
                    documents_skipped += 1
                    continue

-                # Create new document
-                user_llm = await get_user_long_context_llm(
-                    session, user_id, search_space_id
-                )
-
-                if user_llm:
-                    document_metadata = {
-                        "event_id": event_id,
-                        "summary": summary,
-                        "start_time": start_time,
-                        "document_type": "Google Calendar Event (Composio)",
-                    }
-                    (
-                        summary_content,
-                        summary_embedding,
-                    ) = await generate_document_summary(
-                        markdown_content, user_llm, document_metadata
-                    )
-                else:
-                    summary_content = (
-                        f"Calendar: {summary}\n\nStart: {start_time}\nEnd: {end_time}"
-                    )
-                    if location:
-                        summary_content += f"\nLocation: {location}"
-                    summary_embedding = config.embedding_model_instance.embed(
-                        summary_content
-                    )
-
-                chunks = await create_document_chunks(markdown_content)
-
+                # Create new document with PENDING status (visible in UI immediately)
                document = Document(
                    search_space_id=search_space_id,
                    title=summary,
@ -436,19 +370,107 @@ async def index_composio_google_calendar(
                        "toolkit_id": "googlecalendar",
                        "source": "composio",
                    },
-                    content=summary_content,
-                    content_hash=content_hash,
+                    content="Pending...",  # Placeholder until processed
+                    content_hash=unique_identifier_hash,  # Temporary unique value - updated when ready
                    unique_identifier_hash=unique_identifier_hash,
-                    embedding=summary_embedding,
-                    chunks=chunks,
+                    embedding=None,
+                    chunks=[],  # Empty at creation - safe for async
+                    status=DocumentStatus.pending(),  # Pending until processing starts
                    updated_at=get_current_timestamp(),
                    created_by_id=user_id,
                    connector_id=connector_id,
                )
                session.add(document)
+                new_documents_created = True
+
+                events_to_process.append({
+                    'document': document,
+                    'is_new': True,
+                    'markdown_content': markdown_content,
+                    'content_hash': content_hash,
+                    'event_id': event_id,
+                    'summary': summary,
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'location': location,
+                })
+
+            except Exception as e:
+                logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True)
+                documents_failed += 1
+                continue
+
+        # Commit all pending documents - they all appear in UI now
+        if new_documents_created:
+            logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents")
+            await session.commit()
+
+        # =======================================================================
+        # PHASE 2: Process each document one by one
+        # Each document transitions: pending → processing → ready/failed
+        # =======================================================================
+        logger.info(f"Phase 2: Processing {len(events_to_process)} documents")
+
+        for item in events_to_process:
+            # Send heartbeat periodically
+            if on_heartbeat_callback:
+                current_time = time.time()
+                if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS:
+                    await on_heartbeat_callback(documents_indexed)
+                    last_heartbeat_time = current_time
+
+            document = item['document']
+            try:
+                # Set to PROCESSING and commit - shows "processing" in UI for THIS document only
+                document.status = DocumentStatus.processing()
+                await session.commit()
+
+                # Heavy processing (LLM, embeddings, chunks)
+                user_llm = await get_user_long_context_llm(
+                    session, user_id, search_space_id
+                )
+
+                if user_llm:
+                    document_metadata_for_summary = {
+                        "event_id": item['event_id'],
+                        "summary": item['summary'],
+                        "start_time": item['start_time'],
+                        "document_type": "Google Calendar Event (Composio)",
+                    }
+                    summary_content, summary_embedding = await generate_document_summary(
+                        item['markdown_content'], user_llm, document_metadata_for_summary
+                    )
+                else:
+                    summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}"
+                    if item['location']:
+                        summary_content += f"\nLocation: {item['location']}"
+                    summary_embedding = config.embedding_model_instance.embed(
+                        summary_content
+                    )
+
+                chunks = await create_document_chunks(item['markdown_content'])
+
+                # Update document to READY with actual content
+                document.title = item['summary']
+                document.content = summary_content
+                document.content_hash = item['content_hash']
+                document.embedding = summary_embedding
+                document.document_metadata = {
+                    "event_id": item['event_id'],
+                    "summary": item['summary'],
+                    "start_time": item['start_time'],
+                    "end_time": item['end_time'],
+                    "location": item['location'],
+                    "connector_id": connector_id,
+                    "source": "composio",
+                }
+                safe_set_chunks(document, chunks)
+                document.updated_at = get_current_timestamp()
+                document.status = DocumentStatus.ready()
+
                documents_indexed += 1

-                # Batch commit every 10 documents
+                # Batch commit every 10 documents (for ready status updates)
                if documents_indexed % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed} Google Calendar events processed so far"
@ -457,7 +479,13 @@ async def index_composio_google_calendar(

            except Exception as e:
                logger.error(f"Error processing Calendar event: {e!s}", exc_info=True)
-                documents_skipped += 1
+                # Mark document as failed with reason (visible in UI)
+                try:
+                    document.status = DocumentStatus.failed(str(e))
+                    document.updated_at = get_current_timestamp()
+                except Exception as status_error:
+                    logger.error(f"Failed to update document status to failed: {status_error}")
+                documents_failed += 1
                continue

        # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs
@ -490,10 +518,13 @@ async def index_composio_google_calendar(
            else:
                raise

-        # Build warning message if duplicates were found
-        warning_message = None
+        # Build warning message if there were issues
+        warning_parts = []
        if duplicate_content_count > 0:
-            warning_message = f"{duplicate_content_count} skipped (duplicate)"
+            warning_parts.append(f"{duplicate_content_count} duplicate")
+        if documents_failed > 0:
+            warning_parts.append(f"{documents_failed} failed")
+        warning_message = ", ".join(warning_parts) if warning_parts else None

        await task_logger.log_task_success(
            log_entry,
@ -501,13 +532,15 @@ async def index_composio_google_calendar(
            {
                "documents_indexed": documents_indexed,
                "documents_skipped": documents_skipped,
+                "documents_failed": documents_failed,
                "duplicate_content_count": duplicate_content_count,
            },
        )

        logger.info(
-            f"Composio Google Calendar indexing completed: {documents_indexed} new events, {documents_skipped} skipped "
-            f"({duplicate_content_count} due to duplicate content from other connectors)"
+            f"Composio Google Calendar indexing completed: {documents_indexed} ready, "
+            f"{documents_skipped} skipped, {documents_failed} failed "
+            f"({duplicate_content_count} duplicate content)"
        )
        return documents_indexed, warning_message

--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -100,6 +100,80 @@ class PodcastStatus(str, Enum):
    FAILED = "failed"


+class DocumentStatus:
+    """
+    Helper class for document processing status (stored as JSONB).
+    
+    Status values:
+    - {"state": "ready"} - Document is fully processed and searchable
+    - {"state": "pending"} - Document is queued, waiting to be processed
+    - {"state": "processing"} - Document is currently being processed (only 1 at a time)
+    - {"state": "failed", "reason": "..."} - Processing failed with reason
+    
+    Usage:
+        document.status = DocumentStatus.pending()
+        document.status = DocumentStatus.processing()
+        document.status = DocumentStatus.ready()
+        document.status = DocumentStatus.failed("LLM rate limit exceeded")
+    """
+    
+    # State constants
+    READY = "ready"
+    PENDING = "pending"
+    PROCESSING = "processing"
+    FAILED = "failed"
+    
+    @staticmethod
+    def ready() -> dict:
+        """Return status dict for a ready/searchable document."""
+        return {"state": DocumentStatus.READY}
+    
+    @staticmethod
+    def pending() -> dict:
+        """Return status dict for a document waiting to be processed."""
+        return {"state": DocumentStatus.PENDING}
+    
+    @staticmethod
+    def processing() -> dict:
+        """Return status dict for a document being processed."""
+        return {"state": DocumentStatus.PROCESSING}
+    
+    @staticmethod
+    def failed(reason: str, **extra_details) -> dict:
+        """
+        Return status dict for a failed document.
+        
+        Args:
+            reason: Human-readable failure reason
+            **extra_details: Optional additional details (duplicate_of, error_code, etc.)
+        """
+        status = {"state": DocumentStatus.FAILED, "reason": reason[:500]}  # Truncate long reasons
+        if extra_details:
+            status.update(extra_details)
+        return status
+    
+    @staticmethod
+    def get_state(status: dict | None) -> str | None:
+        """Extract state from status dict, returns None if invalid."""
+        if status is None:
+            return None
+        return status.get("state") if isinstance(status, dict) else None
+    
+    @staticmethod
+    def is_state(status: dict | None, state: str) -> bool:
+        """Check if status matches a given state."""
+        return DocumentStatus.get_state(status) == state
+    
+    @staticmethod
+    def get_failure_reason(status: dict | None) -> str | None:
+        """Extract failure reason from status dict."""
+        if status is None or not isinstance(status, dict):
+            return None
+        if status.get("state") == DocumentStatus.FAILED:
+            return status.get("reason")
+        return None
+
+
 class LiteLLMProvider(str, Enum):
    """
    Enum for LLM providers supported by LiteLLM.
@ -785,6 +859,17 @@ class Document(BaseModel, TimestampMixin):
        index=True,
    )

+    # Processing status for real-time visibility (JSONB)
+    # Format: {"state": "ready"}, {"state": "pending"}, {"state": "processing"}, or {"state": "failed", "reason": "..."}
+    # Default to {"state": "ready"} for backward compatibility with existing documents
+    status = Column(
+        JSONB,
+        nullable=False,
+        default=DocumentStatus.ready,
+        server_default=text("'{\"state\": \"ready\"}'::jsonb"),
+        index=True,
+    )
+
    # Relationships
    search_space = relationship("SearchSpace", back_populates="documents")
    created_by = relationship("User", back_populates="documents")
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@ -19,6 +19,7 @@ from app.db import (
 from app.schemas import (
    DocumentRead,
    DocumentsCreate,
+    DocumentStatusSchema,
    DocumentTitleRead,
    DocumentTitleSearchResponse,
    DocumentUpdate,
@ -271,6 +272,14 @@ async def read_documents(
            if doc.created_by:
                created_by_name = doc.created_by.display_name or doc.created_by.email
            
+            # Parse status from JSONB
+            status_data = None
+            if hasattr(doc, 'status') and doc.status:
+                status_data = DocumentStatusSchema(
+                    state=doc.status.get("state", "ready"),
+                    reason=doc.status.get("reason"),
+                )
+            
            api_documents.append(
                DocumentRead(
                    id=doc.id,
@ -285,6 +294,7 @@ async def read_documents(
                    search_space_id=doc.search_space_id,
                    created_by_id=doc.created_by_id,
                    created_by_name=created_by_name,
+                    status=status_data,
                )
            )

@ -417,6 +427,14 @@ async def search_documents(
            if doc.created_by:
                created_by_name = doc.created_by.display_name or doc.created_by.email
            
+            # Parse status from JSONB
+            status_data = None
+            if hasattr(doc, 'status') and doc.status:
+                status_data = DocumentStatusSchema(
+                    state=doc.status.get("state", "ready"),
+                    reason=doc.status.get("reason"),
+                )
+            
            api_documents.append(
                DocumentRead(
                    id=doc.id,
@ -431,6 +449,7 @@ async def search_documents(
                    search_space_id=doc.search_space_id,
                    created_by_id=doc.created_by_id,
                    created_by_name=created_by_name,
+                    status=status_data,
                )
            )

@ -806,6 +825,7 @@ async def delete_document(
    """
    Delete a document.
    Requires DOCUMENTS_DELETE permission for the search space.
+    Documents in "pending" or "processing" state cannot be deleted (409 Conflict).
    """
    try:
        result = await session.execute(
@ -818,6 +838,14 @@ async def delete_document(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

+        # Check if document is pending or currently being processed
+        doc_state = document.status.get("state") if document.status else None
+        if doc_state in ("pending", "processing"):
+            raise HTTPException(
+                status_code=409,  # Conflict
+                detail="Cannot delete document while it is pending or being processed. Please wait for processing to complete.",
+            )
+
        # Check permission for the search space
        await check_permission(
            session,
--- a/surfsense_backend/app/schemas/init.py
+++ b/surfsense_backend/app/schemas/init.py
@ -4,6 +4,7 @@ from .documents import (
    DocumentBase,
    DocumentRead,
    DocumentsCreate,
+    DocumentStatusSchema,
    DocumentTitleRead,
    DocumentTitleSearchResponse,
    DocumentUpdate,
@ -87,6 +88,7 @@ __all__ = [
    # Document schemas
    "DocumentBase",
    "DocumentRead",
+    "DocumentStatusSchema",
    "DocumentTitleRead",
    "DocumentTitleSearchResponse",
    "DocumentUpdate",
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@ -41,6 +41,12 @@ class DocumentUpdate(DocumentBase):
    pass


+class DocumentStatusSchema(BaseModel):
+    """Document processing status."""
+    state: str  # "ready", "pending", "processing", or "failed"
+    reason: str | None = None
+
+
 class DocumentRead(BaseModel):
    id: int
    title: str
@ -54,6 +60,7 @@ class DocumentRead(BaseModel):
    search_space_id: int
    created_by_id: UUID | None = None  # User who created/uploaded this document
    created_by_name: str | None = None  # Display name or email of the user who created this document
+    status: DocumentStatusSchema | None = None  # Processing status (ready, pending, processing, failed)

    model_config = ConfigDict(from_attributes=True)

--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@ -28,6 +28,34 @@ def get_current_timestamp() -> datetime:
    return datetime.now(UTC)


+def safe_set_chunks(document: Document, chunks: list) -> None:
+    """
+    Safely assign chunks to a document without triggering lazy loading.
+    
+    ALWAYS use this instead of `document.chunks = chunks` to avoid
+    SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
+    
+    Why this is needed:
+    - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
+      load the OLD chunks first (for comparison/orphan detection)
+    - This lazy loading fails in async context with asyncpg driver
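+    - set_committed_value bypasses this by installing the value as already-committed state, with no history events (NOTE(review): confirm new chunks are still flushed and old chunks removed)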
+    
+    This function is safe regardless of how the document was loaded
+    (with or without selectinload).
+    
+    Args:
+        document: The Document object to update
+        chunks: List of Chunk objects to assign
+    
+    Example:
+        # Instead of: document.chunks = chunks (DANGEROUS!)
+        safe_set_chunks(document, chunks)  # Always safe
+    """
+    from sqlalchemy.orm.attributes import set_committed_value
+    set_committed_value(document, 'chunks', chunks)
+
+
 async def check_duplicate_document_by_hash(
    session: AsyncSession, content_hash: str
 ) -> Document | None:
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
@ -1,7 +1,7 @@
 "use client";

 import { formatDistanceToNow } from "date-fns";
-import { Calendar, ChevronDown, ChevronUp, FileText, FileX, Loader2, Network, Plus, User } from "lucide-react";
+import { AlertCircle, Calendar, CheckCircle2, ChevronDown, ChevronUp, Clock, FileText, FileX, Loader2, Network, Plus, User } from "lucide-react";
 import { motion } from "motion/react";
 import { useTranslations } from "next-intl";
 import React, { useRef, useState, useEffect, useCallback } from "react";
@ -17,6 +17,7 @@ import {
 	DialogTitle,
 } from "@/components/ui/dialog";
 import { Skeleton } from "@/components/ui/skeleton";
+import { Spinner } from "@/components/ui/spinner";
 import {
 	Table,
 	TableBody,
@ -29,7 +30,61 @@ import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip
 import { documentsApiService } from "@/lib/apis/documents-api.service";
 import { DocumentTypeChip } from "./DocumentTypeIcon";
 import { RowActions } from "./RowActions";
-import type { ColumnVisibility, Document } from "./types";
+import type { ColumnVisibility, Document, DocumentStatus } from "./types";
+
+// Status indicator component for document processing status
+function StatusIndicator({ status }: { status?: DocumentStatus }) {
+	const state = status?.state ?? "ready";
+	
+	switch (state) {
+		case "pending":
+			return (
+				<Tooltip>
+					<TooltipTrigger asChild>
+						<div className="flex items-center justify-center">
+							<Clock className="h-5 w-5 text-muted-foreground" />
+						</div>
+					</TooltipTrigger>
+					<TooltipContent side="top">Pending - waiting to be processed</TooltipContent>
+				</Tooltip>
+			);
+		case "processing":
+			return (
+				<Tooltip>
+					<TooltipTrigger asChild>
+						<div className="flex items-center justify-center">
+							<Spinner size="sm" className="text-primary" />
+						</div>
+					</TooltipTrigger>
+					<TooltipContent side="top">Processing...</TooltipContent>
+				</Tooltip>
+			);
+		case "failed":
+			return (
+				<Tooltip>
+					<TooltipTrigger asChild>
+						<div className="flex items-center justify-center">
+							<AlertCircle className="h-5 w-5 text-destructive" />
+						</div>
+					</TooltipTrigger>
+					<TooltipContent side="top" className="max-w-xs">
+						{status?.reason || "Processing failed"}
+					</TooltipContent>
+				</Tooltip>
+			);
+		case "ready":
+			return (
+				<Tooltip>
+					<TooltipTrigger asChild>
+						<div className="flex items-center justify-center">
+							<CheckCircle2 className="h-5 w-5 text-muted-foreground/60" />
+						</div>
+					</TooltipTrigger>
+					<TooltipContent side="top">Ready</TooltipContent>
+				</Tooltip>
+			);
+	}
+}

 export type SortKey = keyof Pick<Document, "title" | "document_type" | "created_at">;

@ -460,7 +515,7 @@ export function DocumentsTableShell({
 										</TableHead>
 									)}
 									{columnVisibility.created_at && (
-										<TableHead className="w-32">
+										<TableHead className="w-32 border-r border-border/40">
 											<SortableHeader
 												sortKey="created_at"
 												currentSortKey={sortKey}
@ -472,6 +527,13 @@ export function DocumentsTableShell({
 											</SortableHeader>
 										</TableHead>
 									)}
+									{columnVisibility.status && (
+										<TableHead className="w-20 text-center">
+											<span className="text-sm font-medium text-muted-foreground/70">
+												Status
+											</span>
+										</TableHead>
+									)}
 									<TableHead className="w-10">
 										<span className="sr-only">Actions</span>
 									</TableHead>
@ -552,7 +614,7 @@ export function DocumentsTableShell({
 													</TableCell>
 												)}
 												{columnVisibility.created_at && (
-													<TableCell className="w-32 py-2.5 text-sm text-foreground">
+													<TableCell className="w-32 py-2.5 text-sm text-foreground border-r border-border/40">
 														<Tooltip>
 															<TooltipTrigger asChild>
 																<span className="cursor-default">{formatRelativeDate(doc.created_at)}</span>
@ -563,6 +625,11 @@ export function DocumentsTableShell({
 														</Tooltip>
 													</TableCell>
 												)}
+												{columnVisibility.status && (
+													<TableCell className="w-20 py-2.5 text-center">
+														<StatusIndicator status={doc.status} />
+													</TableCell>
+												)}
 												<TableCell className="w-10 py-2.5 text-center">
 													<RowActions
 														document={doc}
@ -647,11 +714,14 @@ export function DocumentsTableShell({
 												)}
 											</div>
 										</div>
-										<RowActions
-											document={doc}
-											deleteDocument={deleteDocument}
-											searchSpaceId={searchSpaceId}
-										/>
+										<div className="flex items-center gap-2">
+											{columnVisibility.status && <StatusIndicator status={doc.status} />}
+											<RowActions
+												document={doc}
+												deleteDocument={deleteDocument}
+												searchSpaceId={searchSpaceId}
+											/>
+										</div>
 									</div>
 								</motion.div>
 							);
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx
@ -45,10 +45,17 @@ export function RowActions({
 		document.document_type as (typeof EDITABLE_DOCUMENT_TYPES)[number]
 	);

-	const isDeletable = !NON_DELETABLE_DOCUMENT_TYPES.includes(
+	// Documents in "pending" or "processing" state should show disabled delete
+	const isBeingProcessed = document.status?.state === "pending" || document.status?.state === "processing";
+
+	// SURFSENSE_DOCS are system-managed and should not show delete at all
+	const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes(
 		document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number]
 	);

+	// Delete is disabled while processing
+	const isDeleteDisabled = isBeingProcessed;
+
 	const handleDelete = async () => {
 		setIsDeleting(true);
 		try {
@ -87,10 +94,11 @@ export function RowActions({
 								<Pencil className="mr-2 h-4 w-4" />
 								<span>Edit</span>
 							</DropdownMenuItem>
-							{isDeletable && (
+							{shouldShowDelete && (
 								<DropdownMenuItem
-									onClick={() => setIsDeleteOpen(true)}
-									className="text-destructive focus:text-destructive"
+									onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
+									disabled={isDeleteDisabled}
+									className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"}
 								>
 									<Trash2 className="mr-2 h-4 w-4" />
 									<span>Delete</span>
@ -100,13 +108,13 @@ export function RowActions({
 					</DropdownMenu>
 				) : (
 					// Non-editable documents: show only delete button directly
-					isDeletable && (
+					shouldShowDelete && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
-							onClick={() => setIsDeleteOpen(true)}
-							disabled={isDeleting}
+							className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground/50 cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
+							onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
+							disabled={isDeleting || isDeleteDisabled}
 						>
 							<Trash2 className="h-4 w-4" />
 							<span className="sr-only">Delete</span>
@ -131,10 +139,11 @@ export function RowActions({
 								<Pencil className="mr-2 h-4 w-4" />
 								<span>Edit</span>
 							</DropdownMenuItem>
-							{isDeletable && (
+							{shouldShowDelete && (
 								<DropdownMenuItem
-									onClick={() => setIsDeleteOpen(true)}
-									className="text-destructive focus:text-destructive"
+									onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
+									disabled={isDeleteDisabled}
+									className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"}
 								>
 									<Trash2 className="mr-2 h-4 w-4" />
 									<span>Delete</span>
@ -144,13 +153,13 @@ export function RowActions({
 					</DropdownMenu>
 				) : (
 					// Non-editable documents: show only delete button directly
-					isDeletable && (
+					shouldShowDelete && (
 						<Button
 							variant="ghost"
 							size="icon"
-							className="h-8 w-8 text-muted-foreground hover:text-destructive hover:bg-destructive/10"
-							onClick={() => setIsDeleteOpen(true)}
-							disabled={isDeleting}
+							className={`h-8 w-8 ${isDeleteDisabled ? "text-muted-foreground/50 cursor-not-allowed" : "text-muted-foreground hover:text-destructive hover:bg-destructive/10"}`}
+							onClick={() => !isDeleteDisabled && setIsDeleteOpen(true)}
+							disabled={isDeleting || isDeleteDisabled}
 						>
 							<Trash2 className="h-4 w-4" />
 							<span className="sr-only">Delete</span>
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
@ -1,5 +1,10 @@
 export type DocumentType = string;

+export type DocumentStatus = { // per-document processing status carried on Document.status
+	state: "ready" | "pending" | "processing" | "failed"; // RowActions disables delete while "pending" or "processing"
+	reason?: string; // optional detail; StatusIndicator shows it in a tooltip, falling back to "Processing failed"
+};
+
 export type Document = {
 	id: number;
 	title: string;
@ -11,10 +16,12 @@ export type Document = {
 	search_space_id: number;
 	created_by_id?: string | null;
 	created_by_name?: string | null;
+	status?: DocumentStatus;
 };

 export type ColumnVisibility = {
 	document_type: boolean;
 	created_by: boolean;
 	created_at: boolean;
+	status: boolean; // gates the Status header cell, the per-row status cell, and the card-view StatusIndicator
 };
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
@ -38,6 +38,7 @@ export default function DocumentsTable() {
 		document_type: true,
 		created_by: true,
 		created_at: true,
+		status: true,
 	});
 	const [pageIndex, setPageIndex] = useState(0);
 	const [sortKey, setSortKey] = useState<SortKey>("created_at");
@ -115,6 +116,7 @@ export default function DocumentsTable() {
 				created_by_id: item.created_by_id ?? null,
 				created_by_name: item.created_by_name ?? null,
 				created_at: item.created_at,
+				status: (item as { status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string } }).status ?? { state: "ready" as const },
 			}))
 		: paginatedRealtimeDocuments;

@ -159,10 +161,35 @@ export default function DocumentsTable() {
 			toast.error(t("no_rows_selected"));
 			return;
 		}
+
+		// Pending/processing documents cannot be deleted, so they are excluded
+		// from the bulk delete and reported to the user instead.
+		// Status lookup source:
+		// - real-time mode: sortedRealtimeDocuments (status is always present)
+		// - search mode: searchResponse items (status is read defensively because
+		//   the item type does not declare it)
+		const allDocs = isSearchMode
+			? (searchResponse?.items ?? []).map((item) => ({
+					id: item.id,
+					status: (item as { status?: { state: string } }).status,
+				}))
+			: sortedRealtimeDocuments.map((doc) => ({ id: doc.id, status: doc.status }));
+
+		// Only documents that are *known* to be pending or processing are blocked.
+		// Counting "selected minus deletable" would also count selected ids that are
+		// simply absent from the lookup list, producing a misleading warning.
+		const inProgressIds = new Set(
+			allDocs
+				.filter(
+					(doc) =>
+						selectedIds.has(doc.id) &&
+						(doc.status?.state === "pending" || doc.status?.state === "processing")
+				)
+				.map((doc) => doc.id)
+		);
+
+		// Start from the selection itself: a selected id with no known status is
+		// treated as deletable, the same way a document without `status` is.
+		const deletableIds = Array.from(selectedIds).filter((id) => !inProgressIds.has(id));
+		const inProgressCount = inProgressIds.size;
+
+		if (inProgressCount > 0) {
+			toast.warning(`${inProgressCount} document(s) are pending or processing and cannot be deleted.`);
+		}
+
+		// Nothing left to delete once in-progress documents are excluded.
+		if (deletableIds.length === 0) {
+			return;
+		}
+
 		try {
 			// Delete documents one by one using the mutation
 			const results = await Promise.all(
-				Array.from(selectedIds).map(async (id) => {
+				deletableIds.map(async (id) => {
 					try {
 						await deleteDocumentMutation({ id });
 						return true;
@ -172,7 +199,7 @@ export default function DocumentsTable() {
 				})
 			);
 			const okCount = results.filter((r) => r === true).length;
-			if (okCount === selectedIds.size)
+			if (okCount === deletableIds.length)
 				toast.success(t("delete_success_count", { count: okCount }));
 			else toast.error(t("delete_partial_failed"));
 			
--- a/surfsense_web/hooks/use-documents.ts
+++ b/surfsense_web/hooks/use-documents.ts
@ -9,6 +9,12 @@ import { useElectricClient } from "@/lib/electric/context";
 // Stable empty array to prevent infinite re-renders when no typeFilter is provided
 const EMPTY_TYPE_FILTER: DocumentTypeEnum[] = [];

+// Document status type (matches backend DocumentStatus JSONB). NOTE(review): same shape as DocumentStatus in documents/(manage)/components/types.ts — keep the two in sync.
+export interface DocumentStatusType {
+	state: "ready" | "pending" | "processing" | "failed"; // useDocuments substitutes { state: "ready" } when a row has no status
+	reason?: string; // optional detail — presumably the failure reason; confirm against the backend writer
+}
+
 // Document from Electric sync (lightweight table columns - NO content/metadata)
 interface DocumentElectric {
 	id: number;
@ -17,6 +23,7 @@ interface DocumentElectric {
 	title: string;
 	created_by_id: string | null;
 	created_at: string;
+	status: DocumentStatusType | null;
 }

 // Document for display (with resolved user name)
@ -28,6 +35,7 @@ export interface DocumentDisplay {
 	created_by_id: string | null;
 	created_by_name: string | null;
 	created_at: string;
+	status: DocumentStatusType;
 }

 /**
@ -117,6 +125,7 @@ export function useDocuments(
 			created_by_id?: string | null;
 			created_by_name?: string | null;
 			created_at: string;
+			status?: DocumentStatusType | null;
 		}): DocumentDisplay => ({
 			id: item.id,
 			search_space_id: item.search_space_id,
@ -125,6 +134,7 @@ export function useDocuments(
 			created_by_id: item.created_by_id ?? null,
 			created_by_name: item.created_by_name ?? null,
 			created_at: item.created_at,
+			status: item.status ?? { state: "ready" },
 		}),
 		[]
 	);
@ -136,6 +146,7 @@ export function useDocuments(
 			created_by_name: doc.created_by_id
 				? userCacheRef.current.get(doc.created_by_id) ?? null
 				: null,
+			status: doc.status ?? { state: "ready" },
 		}),
 		[]
 	);
@ -221,7 +232,7 @@ export function useDocuments(
 				const handle = await client.syncShape({
 					table: "documents",
 					where: `search_space_id = ${spaceId}`,
-					columns: ["id", "document_type", "search_space_id", "title", "created_by_id", "created_at"],
+					columns: ["id", "document_type", "search_space_id", "title", "created_by_id", "created_at", "status"],
 					primaryKey: ["id"],
 				});

@ -259,7 +270,7 @@ export function useDocuments(
 					return;
 				}

-				let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at
+				let query = `SELECT id, document_type, search_space_id, title, created_by_id, created_at, status
 					FROM documents 
 					WHERE search_space_id = $1`;

--- a/surfsense_web/lib/electric/client.ts
+++ b/surfsense_web/lib/electric/client.ts
@ -72,7 +72,9 @@ const pendingSyncs = new Map<string, Promise<SyncHandle>>();
 //     - fixed getSyncCutoffDate to use stable midnight UTC timestamps
 // v6: real-time documents table - added title and created_by_id columns for live document display
 // v7: removed use-documents-electric.ts - consolidated to single documents sync to prevent conflicts
-const SYNC_VERSION = 7;
+// v8: added status column for real-time document processing status (ready/processing/failed)
+// v9: added pending state for accurate document queue visibility
+const SYNC_VERSION = 11; // NOTE(review): the changelog above ends at v9 — record what v10 and v11 changed

 // Database name prefix for identifying SurfSense databases
 const DB_PREFIX = "surfsense-";
@ -245,12 +247,14 @@ export async function initElectric(userId: string): Promise<ElectricClient> {
 					document_type TEXT NOT NULL,
 					title TEXT NOT NULL DEFAULT '',
 					created_by_id TEXT,
-					created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+					created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+					status JSONB DEFAULT '{"state": "ready"}'::jsonb
 				);
 				
 				CREATE INDEX IF NOT EXISTS idx_documents_search_space_id ON documents(search_space_id);
 				CREATE INDEX IF NOT EXISTS idx_documents_type ON documents(document_type);
 				CREATE INDEX IF NOT EXISTS idx_documents_search_space_type ON documents(search_space_id, document_type);
+				CREATE INDEX IF NOT EXISTS idx_documents_status ON documents((status->>'state'));
 			`);

 			await db.exec(`