mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
feat: Enhance document processing notifications and refactor related services
- Introduced a new DocumentProcessingNotificationHandler to manage notifications for document processing stages. - Updated existing notification methods to include detailed progress updates for various stages (queued, parsing, chunking, embedding, storing, completed, failed). - Refactored NotificationService to support the new document processing notification type and metadata schema. - Updated multiple document processing tasks to create and manage notifications throughout the processing lifecycle. - Adjusted UI components to reflect changes in notification types and improve user experience during document uploads and processing.
This commit is contained in:
parent
59a8ef5d64
commit
12671ede0e
7 changed files with 534 additions and 79 deletions
|
|
@ -14,8 +14,9 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.db import Document, DocumentType, Log
|
||||
from app.db import Document, DocumentType, Log, Notification
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.notification_service import NotificationService
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
convert_document_to_markdown,
|
||||
|
|
@ -475,10 +476,17 @@ async def process_file_in_background(
|
|||
log_entry: Log,
|
||||
connector: dict
|
||||
| None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
|
||||
):
|
||||
notification: Notification | None = None, # Optional notification for progress updates
|
||||
) -> Document | None:
|
||||
try:
|
||||
# Check if the file is a markdown or text file
|
||||
if filename.lower().endswith((".md", ".markdown", ".txt")):
|
||||
# Update notification: parsing stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="parsing", stage_message="Reading file"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing markdown/text file: {filename}",
|
||||
|
|
@ -498,6 +506,12 @@ async def process_file_in_background(
|
|||
print("Error deleting temp file", e)
|
||||
pass
|
||||
|
||||
# Update notification: chunking stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="chunking"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Creating document from markdown content: {filename}",
|
||||
|
|
@ -525,17 +539,25 @@ async def process_file_in_background(
|
|||
"file_type": "markdown",
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Markdown file already exists (duplicate): {filename}",
|
||||
{"duplicate_detected": True, "file_type": "markdown"},
|
||||
)
|
||||
return None
|
||||
|
||||
# Check if the file is an audio file
|
||||
elif filename.lower().endswith(
|
||||
(".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
|
||||
):
|
||||
# Update notification: parsing stage (transcription)
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="parsing", stage_message="Transcribing audio"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing audio file for transcription: {filename}",
|
||||
|
|
@ -619,6 +641,12 @@ async def process_file_in_background(
|
|||
},
|
||||
)
|
||||
|
||||
# Update notification: chunking stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="chunking"
|
||||
)
|
||||
|
||||
# Clean up the temp file
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
|
|
@ -646,12 +674,14 @@ async def process_file_in_background(
|
|||
"stt_service": stt_service_type,
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Audio file transcript already exists (duplicate): {filename}",
|
||||
{"duplicate_detected": True, "file_type": "audio"},
|
||||
)
|
||||
return None
|
||||
|
||||
else:
|
||||
# Import page limit service
|
||||
|
|
@ -716,6 +746,12 @@ async def process_file_in_background(
|
|||
) from e
|
||||
|
||||
if app_config.ETL_SERVICE == "UNSTRUCTURED":
|
||||
# Update notification: parsing stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="parsing", stage_message="Extracting content"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing file with Unstructured ETL: {filename}",
|
||||
|
|
@ -741,6 +777,12 @@ async def process_file_in_background(
|
|||
|
||||
docs = await loader.aload()
|
||||
|
||||
# Update notification: chunking stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="chunking", chunks_count=len(docs)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Unstructured ETL completed, creating document: {filename}",
|
||||
|
|
@ -800,6 +842,7 @@ async def process_file_in_background(
|
|||
"pages_processed": final_page_count,
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -810,8 +853,15 @@ async def process_file_in_background(
|
|||
"etl_service": "UNSTRUCTURED",
|
||||
},
|
||||
)
|
||||
return None
|
||||
|
||||
elif app_config.ETL_SERVICE == "LLAMACLOUD":
|
||||
# Update notification: parsing stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="parsing", stage_message="Extracting content"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing file with LlamaCloud ETL: {filename}",
|
||||
|
|
@ -851,6 +901,12 @@ async def process_file_in_background(
|
|||
split_by_page=False
|
||||
)
|
||||
|
||||
# Update notification: chunking stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="chunking", chunks_count=len(markdown_documents)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"LlamaCloud parsing completed, creating documents: {filename}",
|
||||
|
|
@ -943,6 +999,7 @@ async def process_file_in_background(
|
|||
"documents_count": len(markdown_documents),
|
||||
},
|
||||
)
|
||||
return last_created_doc
|
||||
else:
|
||||
# All documents were duplicates (markdown_documents was not empty, but all returned None)
|
||||
await task_logger.log_task_success(
|
||||
|
|
@ -955,8 +1012,15 @@ async def process_file_in_background(
|
|||
"documents_count": len(markdown_documents),
|
||||
},
|
||||
)
|
||||
return None
|
||||
|
||||
elif app_config.ETL_SERVICE == "DOCLING":
|
||||
# Update notification: parsing stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="parsing", stage_message="Extracting content"
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing file with Docling ETL: {filename}",
|
||||
|
|
@ -1039,6 +1103,12 @@ async def process_file_in_background(
|
|||
},
|
||||
)
|
||||
|
||||
# Update notification: chunking stage
|
||||
if notification:
|
||||
await NotificationService.document_processing.notify_processing_progress(
|
||||
session, notification, stage="chunking"
|
||||
)
|
||||
|
||||
# Process the document using our Docling background task
|
||||
doc_result = await add_received_file_document_using_docling(
|
||||
session,
|
||||
|
|
@ -1071,6 +1141,7 @@ async def process_file_in_background(
|
|||
"pages_processed": final_page_count,
|
||||
},
|
||||
)
|
||||
return doc_result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -1081,6 +1152,7 @@ async def process_file_in_background(
|
|||
"etl_service": "DOCLING",
|
||||
},
|
||||
)
|
||||
return None
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue