diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 75bd4a368..ad6c9d0c9 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -32,10 +32,8 @@ TTS_SERVICE_API_KEY= TTS_SERVICE_API_BASE= # STT Service Configuration -# Use 'local' for offline Faster-Whisper or LiteLLM provider -STT_SERVICE=local -# For local STT: Whisper model size (tiny, base, small, medium, large-v3) -LOCAL_STT_MODEL=base +# For local Faster-Whisper: local/MODEL_SIZE (tiny, base, small, medium, large-v3) +STT_SERVICE=local/base # For LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers # STT_SERVICE=openai/whisper-1 # STT_SERVICE_API_KEY="" diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 374c1e704..acd1017e4 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -106,9 +106,6 @@ class Config: STT_SERVICE = os.getenv("STT_SERVICE") STT_SERVICE_API_BASE = os.getenv("STT_SERVICE_API_BASE") STT_SERVICE_API_KEY = os.getenv("STT_SERVICE_API_KEY") - - # Local STT Configuration - LOCAL_STT_MODEL = os.getenv("LOCAL_STT_MODEL", "base") # Validation Checks # Check embedding dimension diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index c1eed6478..1c7e3505f 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -17,7 +17,6 @@ from .luma_add_connector_route import router as luma_add_connector_router from .podcasts_routes import router as podcasts_router from .search_source_connectors_routes import router as search_source_connectors_router from .search_spaces_routes import router as search_spaces_router -from .stt_routes import router as stt_router router = APIRouter() @@ -32,4 +31,3 @@ router.include_router(airtable_add_connector_router) router.include_router(luma_add_connector_router) router.include_router(llm_config_router) router.include_router(logs_router) -router.include_router(stt_router) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index dd7b56033..b9c194e23 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -784,25 +784,43 @@ async def process_file_in_background( {"file_type": "audio", "processing_stage": "starting_transcription"}, ) - # Open the audio file for transcription - with open(file_path, "rb") as audio_file: + # Check if using local STT service + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/"): + # Use local Faster-Whisper for transcription + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + transcribed_text = result["text"] + + await task_logger.log_task_progress( + log_entry, + f"Local STT transcription completed: {filename}", + { + "processing_stage": "local_transcription_complete", + "language": result["language"], + "confidence": result["language_probability"], + "duration": result["duration"], + }, + ) + else: # Use LiteLLM for audio transcription - if app_config.STT_SERVICE_API_BASE: - transcription_response = await atranscription( - model=app_config.STT_SERVICE, - file=audio_file, - api_base=app_config.STT_SERVICE_API_BASE, - api_key=app_config.STT_SERVICE_API_KEY, - ) - else: - transcription_response = await atranscription( - model=app_config.STT_SERVICE, - api_key=app_config.STT_SERVICE_API_KEY, - file=audio_file, - ) + with open(file_path, "rb") as audio_file: + if app_config.STT_SERVICE_API_BASE: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + file=audio_file, + api_base=app_config.STT_SERVICE_API_BASE, + api_key=app_config.STT_SERVICE_API_KEY, + ) + else: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + api_key=app_config.STT_SERVICE_API_KEY, + file=audio_file, + ) - # Extract the transcribed text - transcribed_text = transcription_response.get("text", "") + # Extract the transcribed text + transcribed_text = transcription_response.get("text", "") # Add metadata about the transcription transcribed_text = ( @@ -831,6 +849,7 @@ async def process_file_in_background( ) if result: + stt_service_type = "local" if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") else "external" await task_logger.log_task_success( log_entry, f"Successfully transcribed and processed audio file: {filename}", @@ -839,6 +858,7 @@ async def process_file_in_background( "content_hash": result.content_hash, "file_type": "audio", "transcript_length": len(transcribed_text), + "stt_service": stt_service_type, }, ) else: diff --git a/surfsense_backend/app/routes/stt_routes.py b/surfsense_backend/app/routes/stt_routes.py deleted file mode 100644 index ed11cdd5a..000000000 --- a/surfsense_backend/app/routes/stt_routes.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Speech-to-Text API routes.""" - -from fastapi import APIRouter, File, Form, HTTPException, UploadFile -from fastapi.responses import JSONResponse - -from app.services.stt_service import stt_service - -router = APIRouter(prefix="/stt", tags=["Speech-to-Text"]) - - -@router.post("/transcribe") -async def transcribe_audio( - audio: UploadFile = File(..., description="Audio file to transcribe"), - language: str = Form(None, description="Optional language code (e.g., 'en', 'es')"), -): - """Transcribe uploaded audio file to text.""" - - # Validate file type - if not audio.content_type or not audio.content_type.startswith("audio/"): - raise HTTPException( - status_code=400, - detail="File must be an audio file" - ) - - try: - # Read audio bytes - audio_bytes = await audio.read() - - # Transcribe - result = stt_service.transcribe_bytes( - audio_bytes, - filename=audio.filename or "audio.wav", - language=language if language else None - ) - - return JSONResponse(content={ - "success": True, - "transcription": result["text"], - "metadata": { - "detected_language": result["language"], - "language_probability": result["language_probability"], - "duration_seconds": result["duration"], - "model_size": stt_service.model_size, - } - }) - - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Transcription failed: {str(e)}" - ) - - -@router.get("/models") -async def get_available_models(): - """Get list of available Whisper models.""" - return JSONResponse(content={ - "models": [ - {"name": "tiny", "size": "~39 MB", "speed": "fastest", "accuracy": "lowest"}, - {"name": "base", "size": "~74 MB", "speed": "fast", "accuracy": "good"}, - {"name": "small", "size": "~244 MB", "speed": "medium", "accuracy": "better"}, - {"name": "medium", "size": "~769 MB", "speed": "slow", "accuracy": "high"}, - {"name": "large-v3", "size": "~1550 MB", "speed": "slowest", "accuracy": "highest"}, - ], - "current_model": stt_service.model_size, - "note": "Models are downloaded automatically on first use" - }) - - -@router.post("/change-model") -async def change_model(model_size: str = Form(...)): - """Change the active Whisper model.""" - - valid_models = ["tiny", "base", "small", "medium", "large-v3"] - if model_size not in valid_models: - raise HTTPException( - status_code=400, - detail=f"Invalid model. Choose from: {valid_models}" - ) - - try: - # Create new service instance with different model - global stt_service - stt_service = type(stt_service)(model_size=model_size) - - return JSONResponse(content={ - "success": True, - "message": f"Model changed to {model_size}", - "note": "Model will be downloaded on next transcription if not cached" - }) - - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Failed to change model: {str(e)}" - ) \ No newline at end of file diff --git a/surfsense_backend/app/services/stt_service.py b/surfsense_backend/app/services/stt_service.py index f5c62781e..273fef05b 100644 --- a/surfsense_backend/app/services/stt_service.py +++ b/surfsense_backend/app/services/stt_service.py @@ -12,13 +12,14 @@ from app.config import config class STTService: """Local Speech-to-Text service using Faster-Whisper.""" - def __init__(self, model_size: Optional[str] = None): - """Initialize STT service with specified model size. - - Args: - model_size: Whisper model size ("tiny", "base", "small", "medium", "large-v3") - """ - self.model_size = model_size or config.LOCAL_STT_MODEL + def __init__(self): + """Initialize STT service with model from STT_SERVICE config.""" + # Parse model from STT_SERVICE (e.g., "local/base" or "local/tiny") + stt_service = config.STT_SERVICE or "local/base" + if stt_service.startswith("local/"): + self.model_size = stt_service.split("/", 1)[1] + else: + self.model_size = "base" # fallback self._model: Optional[WhisperModel] = None def _get_model(self) -> WhisperModel: diff --git a/surfsense_web/components/stt/audio-recorder.tsx b/surfsense_web/components/stt/audio-recorder.tsx deleted file mode 100644 index 7b88dbad4..000000000 --- a/surfsense_web/components/stt/audio-recorder.tsx +++ /dev/null @@ -1,109 +0,0 @@ -"use client"; - -import { useState, useRef } from "react"; -import { Button } from "@/components/ui/button"; -import { Mic, Square, Upload } from "lucide-react"; - -interface AudioRecorderProps { - onTranscription: (text: string) => void; - apiUrl?: string; -} - -export function AudioRecorder({ onTranscription, apiUrl = "/api/v1/stt" }: AudioRecorderProps) { - const [isRecording, setIsRecording] = useState(false); - const [isTranscribing, setIsTranscribing] = useState(false); - const mediaRecorderRef = useRef(null); - const chunksRef = useRef([]); - - const startRecording = async () => { - try { - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - const mediaRecorder = new MediaRecorder(stream); - mediaRecorderRef.current = mediaRecorder; - chunksRef.current = []; - - mediaRecorder.ondataavailable = (event) => { - chunksRef.current.push(event.data); - }; - - mediaRecorder.onstop = async () => { - const audioBlob = new Blob(chunksRef.current, { type: "audio/wav" }); - await transcribeAudio(audioBlob); - stream.getTracks().forEach(track => track.stop()); - }; - - mediaRecorder.start(); - setIsRecording(true); - } catch (error) { - console.error("Error starting recording:", error); - } - }; - - const stopRecording = () => { - if (mediaRecorderRef.current && isRecording) { - mediaRecorderRef.current.stop(); - setIsRecording(false); - } - }; - - const transcribeAudio = async (audioBlob: Blob) => { - setIsTranscribing(true); - - const formData = new FormData(); - formData.append("audio", audioBlob, "recording.wav"); - - try { - const response = await fetch(`${apiUrl}/transcribe`, { - method: "POST", - body: formData, - }); - - if (!response.ok) throw new Error("Transcription failed"); - - const result = await response.json(); - onTranscription(result.transcription); - } catch (error) { - console.error("Transcription error:", error); - } finally { - setIsTranscribing(false); - } - }; - - const handleFileUpload = async (event: React.ChangeEvent) => { - const file = event.target.files?.[0]; - if (!file) return; - - await transcribeAudio(file); - }; - - return ( -
- - - - - {isTranscribing && Transcribing...} -
- ); -} \ No newline at end of file