Added local Speech-to-Text (STT) support using Faster-Whisper

This commit is contained in:
Nabhan 2025-10-11 23:56:12 +05:00
parent 402039f02f
commit dad79674c8
8 changed files with 396 additions and 7 deletions

View file

@ -0,0 +1,95 @@
"""Local Speech-to-Text service using Faster-Whisper."""
import os
import tempfile
from pathlib import Path
from typing import Optional
from faster_whisper import WhisperModel
from app.config import config
class STTService:
"""Local Speech-to-Text service using Faster-Whisper."""
def __init__(self, model_size: Optional[str] = None):
"""Initialize STT service with specified model size.
Args:
model_size: Whisper model size ("tiny", "base", "small", "medium", "large-v3")
"""
self.model_size = model_size or config.LOCAL_STT_MODEL
self._model: Optional[WhisperModel] = None
def _get_model(self) -> WhisperModel:
"""Lazy load the Whisper model."""
if self._model is None:
# Use CPU with optimizations for better performance
self._model = WhisperModel(
self.model_size,
device="cpu",
compute_type="int8", # Quantization for faster CPU inference
num_workers=1, # Single worker for stability
)
return self._model
def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> dict:
"""Transcribe audio file to text.
Args:
audio_path: Path to audio file
language: Optional language code (e.g., "en", "es")
Returns:
Dict with transcription text and metadata
"""
model = self._get_model()
# Transcribe with optimized settings
segments, info = model.transcribe(
audio_path,
language=language,
beam_size=1, # Faster inference
best_of=1, # Single pass
temperature=0, # Deterministic output
vad_filter=True, # Voice activity detection
vad_parameters=dict(min_silence_duration_ms=500),
)
# Combine all segments
text = " ".join(segment.text.strip() for segment in segments)
return {
"text": text,
"language": info.language,
"language_probability": info.language_probability,
"duration": info.duration,
}
def transcribe_bytes(self, audio_bytes: bytes, filename: str = "audio.wav",
language: Optional[str] = None) -> dict:
"""Transcribe audio from bytes.
Args:
audio_bytes: Audio file bytes
filename: Original filename for format detection
language: Optional language code
Returns:
Dict with transcription text and metadata
"""
# Save bytes to temporary file
suffix = Path(filename).suffix or ".wav"
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
tmp_file.write(audio_bytes)
tmp_path = tmp_file.name
try:
return self.transcribe_file(tmp_path, language)
finally:
# Clean up temp file
os.unlink(tmp_path)
# Global STT service instance
stt_service = STTService()