fix: add defensive dictionary access and error handling for local STT

- Use .get() for safe dictionary access instead of direct key access
- Add explicit try/except for local STT transcription failures
- Validate that the transcription result is not empty
- Provide clear error messages for corrupted audio files
- Match the error handling pattern used for the external STT service
2026-04-25 16:56:22 +02:00 · 2025-10-12 11:14:12 +05:00 · 2025-10-12 11:14:12 +05:00 · 15ba2b86f6
commit 15ba2b86f6
parent 504399ad01
1 changed file with 14 additions and 5 deletions
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -792,17 +792,26 @@ async def process_file_in_background(
                # Use local Faster-Whisper for transcription
                from app.services.stt_service import stt_service
                
-                result = stt_service.transcribe_file(file_path)
-                transcribed_text = result["text"]
+                try:
+                    result = stt_service.transcribe_file(file_path)
+                    transcribed_text = result.get("text", "")
+                    
+                    if not transcribed_text:
+                        raise ValueError("Transcription returned empty text")
+                except Exception as e:
+                    raise HTTPException(
+                        status_code=422,
+                        detail=f"Failed to transcribe audio file {filename}: {str(e)}"
+                    ) from e
                
                await task_logger.log_task_progress(
                    log_entry,
                    f"Local STT transcription completed: {filename}",
                    {
                        "processing_stage": "local_transcription_complete",
-                        "language": result["language"],
-                        "confidence": result["language_probability"],
-                        "duration": result["duration"],
+                        "language": result.get("language"),
+                        "confidence": result.get("language_probability"),
+                        "duration": result.get("duration"),
                    },
                )
            else: