feat: Enhance document processing notifications and refactor related services

- Introduced a new DocumentProcessingNotificationHandler to manage notifications for document processing stages.
- Updated existing notification methods to include detailed progress updates for various stages (queued, parsing, chunking, embedding, storing, completed, failed).
- Refactored NotificationService to support the new document processing notification type and metadata schema.
- Updated multiple document processing tasks to create and manage notifications throughout the processing lifecycle.
- Adjusted UI components to reflect changes in notification types and improve user experience during document uploads and processing.
This commit is contained in:
Anish Sarkar 2026-01-13 19:09:12 +05:30
parent 59a8ef5d64
commit 12671ede0e
7 changed files with 534 additions and 79 deletions

View file

@ -14,8 +14,9 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import Document, DocumentType, Log
from app.db import Document, DocumentType, Log, Notification
from app.services.llm_service import get_user_long_context_llm
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
convert_document_to_markdown,
@ -475,10 +476,17 @@ async def process_file_in_background(
log_entry: Log,
connector: dict
| None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
):
notification: Notification | None = None, # Optional notification for progress updates
) -> Document | None:
try:
# Check if the file is a markdown or text file
if filename.lower().endswith((".md", ".markdown", ".txt")):
# Update notification: parsing stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="parsing", stage_message="Reading file"
)
await task_logger.log_task_progress(
log_entry,
f"Processing markdown/text file: {filename}",
@ -498,6 +506,12 @@ async def process_file_in_background(
print("Error deleting temp file", e)
pass
# Update notification: chunking stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="chunking"
)
await task_logger.log_task_progress(
log_entry,
f"Creating document from markdown content: {filename}",
@ -525,17 +539,25 @@ async def process_file_in_background(
"file_type": "markdown",
},
)
return result
else:
await task_logger.log_task_success(
log_entry,
f"Markdown file already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "markdown"},
)
return None
# Check if the file is an audio file
elif filename.lower().endswith(
(".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
):
# Update notification: parsing stage (transcription)
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="parsing", stage_message="Transcribing audio"
)
await task_logger.log_task_progress(
log_entry,
f"Processing audio file for transcription: {filename}",
@ -619,6 +641,12 @@ async def process_file_in_background(
},
)
# Update notification: chunking stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="chunking"
)
# Clean up the temp file
try:
os.unlink(file_path)
@ -646,12 +674,14 @@ async def process_file_in_background(
"stt_service": stt_service_type,
},
)
return result
else:
await task_logger.log_task_success(
log_entry,
f"Audio file transcript already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "audio"},
)
return None
else:
# Import page limit service
@ -716,6 +746,12 @@ async def process_file_in_background(
) from e
if app_config.ETL_SERVICE == "UNSTRUCTURED":
# Update notification: parsing stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="parsing", stage_message="Extracting content"
)
await task_logger.log_task_progress(
log_entry,
f"Processing file with Unstructured ETL: {filename}",
@ -741,6 +777,12 @@ async def process_file_in_background(
docs = await loader.aload()
# Update notification: chunking stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="chunking", chunks_count=len(docs)
)
await task_logger.log_task_progress(
log_entry,
f"Unstructured ETL completed, creating document: {filename}",
@ -800,6 +842,7 @@ async def process_file_in_background(
"pages_processed": final_page_count,
},
)
return result
else:
await task_logger.log_task_success(
log_entry,
@ -810,8 +853,15 @@ async def process_file_in_background(
"etl_service": "UNSTRUCTURED",
},
)
return None
elif app_config.ETL_SERVICE == "LLAMACLOUD":
# Update notification: parsing stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="parsing", stage_message="Extracting content"
)
await task_logger.log_task_progress(
log_entry,
f"Processing file with LlamaCloud ETL: {filename}",
@ -851,6 +901,12 @@ async def process_file_in_background(
split_by_page=False
)
# Update notification: chunking stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="chunking", chunks_count=len(markdown_documents)
)
await task_logger.log_task_progress(
log_entry,
f"LlamaCloud parsing completed, creating documents: {filename}",
@ -943,6 +999,7 @@ async def process_file_in_background(
"documents_count": len(markdown_documents),
},
)
return last_created_doc
else:
# All documents were duplicates (markdown_documents was not empty, but all returned None)
await task_logger.log_task_success(
@ -955,8 +1012,15 @@ async def process_file_in_background(
"documents_count": len(markdown_documents),
},
)
return None
elif app_config.ETL_SERVICE == "DOCLING":
# Update notification: parsing stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="parsing", stage_message="Extracting content"
)
await task_logger.log_task_progress(
log_entry,
f"Processing file with Docling ETL: {filename}",
@ -1039,6 +1103,12 @@ async def process_file_in_background(
},
)
# Update notification: chunking stage
if notification:
await NotificationService.document_processing.notify_processing_progress(
session, notification, stage="chunking"
)
# Process the document using our Docling background task
doc_result = await add_received_file_document_using_docling(
session,
@ -1071,6 +1141,7 @@ async def process_file_in_background(
"pages_processed": final_page_count,
},
)
return doc_result
else:
await task_logger.log_task_success(
log_entry,
@ -1081,6 +1152,7 @@ async def process_file_in_background(
"etl_service": "DOCLING",
},
)
return None
except Exception as e:
await session.rollback()