Merge pull request #387 from nabthebest135/local-stt

local STT implementation with Faster-Whisper
2026-07-22 23:31:12 +02:00 · 2025-10-15 14:08:09 -07:00 · 2025-10-15 14:08:09 -07:00 · 5ed9aa2b0b
commit 5ed9aa2b0b
parent d868bae134 6f75ad159d
6 changed files with 2721 additions and 2511 deletions
--- a/surfsense_backend/.env.example
+++ b/surfsense_backend/.env.example
@@ -31,12 +31,13 @@ TTS_SERVICE_API_KEY=
 # OPTIONAL: TTS Provider API Base
 TTS_SERVICE_API_BASE=

-# LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers
-STT_SERVICE=openai/whisper-1
-# Respective STT Service API
-STT_SERVICE_API_KEY=""
-# OPTIONAL: STT Provider API Base
-STT_SERVICE_API_BASE=
+# STT Service Configuration
+# For local Faster-Whisper: local/MODEL_SIZE (tiny, base, small, medium, large-v3)
+STT_SERVICE=local/base
+# For LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers
+# STT_SERVICE=openai/whisper-1
+# STT_SERVICE_API_KEY=""
+# STT_SERVICE_API_BASE=


 FIRECRAWL_API_KEY=fcr-01J0000000000000000000000
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -102,7 +102,7 @@ class Config:
    TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
    TTS_SERVICE_API_KEY = os.getenv("TTS_SERVICE_API_KEY")

-    # Litellm STT Configuration
+    # STT Configuration
    STT_SERVICE = os.getenv("STT_SERVICE")
    STT_SERVICE_API_BASE = os.getenv("STT_SERVICE_API_BASE")
    STT_SERVICE_API_KEY = os.getenv("STT_SERVICE_API_KEY")
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -784,25 +784,59 @@ async def process_file_in_background(
                {"file_type": "audio", "processing_stage": "starting_transcription"},
            )

-            # Open the audio file for transcription
-            with open(file_path, "rb") as audio_file:
+            # Determine STT service type
+            stt_service_type = "local" if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") else "external"
+            
+            # Check if using local STT service
+            if stt_service_type == "local":
+                # Use local Faster-Whisper for transcription
+                from app.services.stt_service import stt_service
+                
+                try:
+                    result = stt_service.transcribe_file(file_path)
+                    transcribed_text = result.get("text", "")
+                    
+                    if not transcribed_text:
+                        raise ValueError("Transcription returned empty text")
+                    
+                    # Add metadata about the transcription
+                    transcribed_text = (
+                        f"# Transcription of {filename}\n\n{transcribed_text}"
+                    )
+                except Exception as e:
+                    raise HTTPException(
+                        status_code=422,
+                        detail=f"Failed to transcribe audio file {filename}: {str(e)}"
+                    ) from e
+                
+                await task_logger.log_task_progress(
+                    log_entry,
+                    f"Local STT transcription completed: {filename}",
+                    {
+                        "processing_stage": "local_transcription_complete",
+                        "language": result.get("language"),
+                        "confidence": result.get("language_probability"),
+                        "duration": result.get("duration"),
+                    },
+                )
+            else:
                # Use LiteLLM for audio transcription
-                if app_config.STT_SERVICE_API_BASE:
-                    transcription_response = await atranscription(
-                        model=app_config.STT_SERVICE,
-                        file=audio_file,
-                        api_base=app_config.STT_SERVICE_API_BASE,
-                        api_key=app_config.STT_SERVICE_API_KEY,
-                    )
-                else:
-                    transcription_response = await atranscription(
-                        model=app_config.STT_SERVICE,
-                        api_key=app_config.STT_SERVICE_API_KEY,
-                        file=audio_file,
-                    )
+                with open(file_path, "rb") as audio_file:
+                    transcription_kwargs = {
+                        "model": app_config.STT_SERVICE,
+                        "file": audio_file,
+                        "api_key": app_config.STT_SERVICE_API_KEY,
+                    }
+                    if app_config.STT_SERVICE_API_BASE:
+                        transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+                    
+                    transcription_response = await atranscription(**transcription_kwargs)

-                # Extract the transcribed text
-                transcribed_text = transcription_response.get("text", "")
+                    # Extract the transcribed text
+                    transcribed_text = transcription_response.get("text", "")
+                    
+                    if not transcribed_text:
+                        raise ValueError("Transcription returned empty text")

                # Add metadata about the transcription
                transcribed_text = (
@@ -839,6 +873,7 @@ async def process_file_in_background(
                        "content_hash": result.content_hash,
                        "file_type": "audio",
                        "transcript_length": len(transcribed_text),
+                        "stt_service": stt_service_type,
                    },
                )
            else:
--- a/surfsense_backend/app/services/stt_service.py
+++ b/surfsense_backend/app/services/stt_service.py
@@ -0,0 +1,96 @@
+"""Local Speech-to-Text service using Faster-Whisper."""
+
+import os
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+from faster_whisper import WhisperModel
+from app.config import config
+
+
+class STTService:
+    """Local Speech-to-Text service using Faster-Whisper."""
+
+    def __init__(self):
+        """Initialize STT service with model from STT_SERVICE config."""
+        # STT_SERVICE looks like "local/base"; an unset or non-local value, or a
+        # bare "local/" with no model name, falls back to the "base" model.
+        stt_service = config.STT_SERVICE or ""
+        is_local = stt_service.startswith("local/")
+        model_size = stt_service.split("/", 1)[1].strip() if is_local else ""
+        self.model_size = model_size or "base"
+        self._model: Optional[WhisperModel] = None
+
+    def _get_model(self) -> WhisperModel:
+        """Return the Whisper model, creating it on the first call."""
+        if self._model is None:
+            # Use CPU with optimizations for better performance
+            self._model = WhisperModel(
+                self.model_size,
+                device="cpu",
+                compute_type="int8",  # Quantization for faster CPU inference
+                num_workers=1,  # Single worker for stability
+            )
+        return self._model
+
+    def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> dict:
+        """Transcribe audio file to text.
+
+        Args:
+            audio_path: Path to audio file
+            language: Optional language code (e.g., "en", "es")
+
+        Returns:
+            Dict with "text", "language", "language_probability" and "duration"
+        """
+        model = self._get_model()
+
+        # Transcribe with optimized settings
+        segments, info = model.transcribe(
+            audio_path,
+            language=language,
+            beam_size=1,  # Faster inference
+            best_of=1,  # Single pass
+            temperature=0,  # Deterministic output
+            vad_filter=True,  # Voice activity detection
+            vad_parameters={"min_silence_duration_ms": 500},
+        )
+
+        # Join only non-empty segments so blank ones cannot yield whitespace-only text
+        text = " ".join(t for t in (s.text.strip() for s in segments) if t)
+
+        return {
+            "text": text,
+            "language": info.language,
+            "language_probability": info.language_probability,
+            "duration": info.duration,
+        }
+
+    def transcribe_bytes(self, audio_bytes: bytes, filename: str = "audio.wav",
+                        language: Optional[str] = None) -> dict:
+        """Transcribe audio from bytes.
+
+        Args:
+            audio_bytes: Audio file bytes
+            filename: Original filename; its suffix is reused for the temp file
+            language: Optional language code
+
+        Returns:
+            Dict with transcription text and metadata
+        """
+        suffix = Path(filename).suffix or ".wav"
+        tmp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+        tmp_path = tmp_file.name
+        try:
+            # Close the file before transcribing so the bytes are flushed to disk
+            with tmp_file:
+                tmp_file.write(audio_bytes)
+            return self.transcribe_file(tmp_path, language)
+        finally:
+            # Clean up temp file, even when writing it failed
+            os.unlink(tmp_path)
+
+
+# Global STT service instance, created once when this module is imported
+stt_service = STTService()
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -43,6 +43,7 @@ dependencies = [
    "youtube-transcript-api>=1.0.3",
    "litellm>=1.77.5",
    "langchain-litellm>=0.2.3",
+    "faster-whisper>=1.1.0",
 ]

 [dependency-groups]
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock