feat: Enhance document processing notifications and refactor related services

- Introduced a new DocumentProcessingNotificationHandler to manage notifications for document processing stages.
- Updated existing notification methods to include detailed progress updates for various stages (queued, parsing, chunking, embedding, storing, completed, failed).
- Refactored NotificationService to support the new document processing notification type and metadata schema.
- Updated multiple document processing tasks to create and manage notifications throughout the processing lifecycle.
- Adjusted UI components to reflect changes in notification types and improve user experience during document uploads and processing.
2026-04-26 17:26:23 +02:00 · 2026-01-13 19:09:12 +05:30 · 2026-01-13 19:09:12 +05:30 · 12671ede0e
commit 12671ede0e
parent 59a8ef5d64
7 changed files with 534 additions and 79 deletions
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -14,8 +14,9 @@ from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.config import config as app_config
-from app.db import Document, DocumentType, Log
+from app.db import Document, DocumentType, Log, Notification
 from app.services.llm_service import get_user_long_context_llm
+from app.services.notification_service import NotificationService
 from app.services.task_logging_service import TaskLoggingService
 from app.utils.document_converters import (
    convert_document_to_markdown,
@ -475,10 +476,17 @@ async def process_file_in_background(
    log_entry: Log,
    connector: dict
    | None = None,  # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
-):
+    notification: Notification | None = None,  # Optional notification for progress updates
+) -> Document | None:
    try:
        # Check if the file is a markdown or text file
        if filename.lower().endswith((".md", ".markdown", ".txt")):
+            # Update notification: parsing stage
+            if notification:
+                await NotificationService.document_processing.notify_processing_progress(
+                    session, notification, stage="parsing", stage_message="Reading file"
+                )
+
            await task_logger.log_task_progress(
                log_entry,
                f"Processing markdown/text file: {filename}",
@ -498,6 +506,12 @@ async def process_file_in_background(
                print("Error deleting temp file", e)
                pass

+            # Update notification: chunking stage
+            if notification:
+                await NotificationService.document_processing.notify_processing_progress(
+                    session, notification, stage="chunking"
+                )
+
            await task_logger.log_task_progress(
                log_entry,
                f"Creating document from markdown content: {filename}",
@ -525,17 +539,25 @@ async def process_file_in_background(
                        "file_type": "markdown",
                    },
                )
+                return result
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"Markdown file already exists (duplicate): {filename}",
                    {"duplicate_detected": True, "file_type": "markdown"},
                )
+                return None

        # Check if the file is an audio file
        elif filename.lower().endswith(
            (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
        ):
+            # Update notification: parsing stage (transcription)
+            if notification:
+                await NotificationService.document_processing.notify_processing_progress(
+                    session, notification, stage="parsing", stage_message="Transcribing audio"
+                )
+
            await task_logger.log_task_progress(
                log_entry,
                f"Processing audio file for transcription: {filename}",
@ -619,6 +641,12 @@ async def process_file_in_background(
                },
            )

+            # Update notification: chunking stage
+            if notification:
+                await NotificationService.document_processing.notify_processing_progress(
+                    session, notification, stage="chunking"
+                )
+
            # Clean up the temp file
            try:
                os.unlink(file_path)
@ -646,12 +674,14 @@ async def process_file_in_background(
                        "stt_service": stt_service_type,
                    },
                )
+                return result
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"Audio file transcript already exists (duplicate): {filename}",
                    {"duplicate_detected": True, "file_type": "audio"},
                )
+                return None

        else:
            # Import page limit service
@ -716,6 +746,12 @@ async def process_file_in_background(
                ) from e

            if app_config.ETL_SERVICE == "UNSTRUCTURED":
+                # Update notification: parsing stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="parsing", stage_message="Extracting content"
+                    )
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"Processing file with Unstructured ETL: {filename}",
@ -741,6 +777,12 @@ async def process_file_in_background(

                docs = await loader.aload()

+                # Update notification: chunking stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="chunking", chunks_count=len(docs)
+                    )
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"Unstructured ETL completed, creating document: {filename}",
@ -800,6 +842,7 @@ async def process_file_in_background(
                            "pages_processed": final_page_count,
                        },
                    )
+                    return result
                else:
                    await task_logger.log_task_success(
                        log_entry,
@ -810,8 +853,15 @@ async def process_file_in_background(
                            "etl_service": "UNSTRUCTURED",
                        },
                    )
+                    return None

            elif app_config.ETL_SERVICE == "LLAMACLOUD":
+                # Update notification: parsing stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="parsing", stage_message="Extracting content"
+                    )
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"Processing file with LlamaCloud ETL: {filename}",
@ -851,6 +901,12 @@ async def process_file_in_background(
                    split_by_page=False
                )

+                # Update notification: chunking stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="chunking", chunks_count=len(markdown_documents)
+                    )
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"LlamaCloud parsing completed, creating documents: {filename}",
@ -943,6 +999,7 @@ async def process_file_in_background(
                            "documents_count": len(markdown_documents),
                        },
                    )
+                    return last_created_doc
                else:
                    # All documents were duplicates (markdown_documents was not empty, but all returned None)
                    await task_logger.log_task_success(
@ -955,8 +1012,15 @@ async def process_file_in_background(
                            "documents_count": len(markdown_documents),
                        },
                    )
+                    return None

            elif app_config.ETL_SERVICE == "DOCLING":
+                # Update notification: parsing stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="parsing", stage_message="Extracting content"
+                    )
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"Processing file with Docling ETL: {filename}",
@ -1039,6 +1103,12 @@ async def process_file_in_background(
                        },
                    )

+                # Update notification: chunking stage
+                if notification:
+                    await NotificationService.document_processing.notify_processing_progress(
+                        session, notification, stage="chunking"
+                    )
+
                # Process the document using our Docling background task
                doc_result = await add_received_file_document_using_docling(
                    session,
@ -1071,6 +1141,7 @@ async def process_file_in_background(
                            "pages_processed": final_page_count,
                        },
                    )
+                    return doc_result
                else:
                    await task_logger.log_task_success(
                        log_entry,
@ -1081,6 +1152,7 @@ async def process_file_in_background(
                            "etl_service": "DOCLING",
                        },
                    )
+                    return None
    except Exception as e:
        await session.rollback()