diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index a3c69fe2a..08a352e75 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -785,20 +785,25 @@ async def process_file_in_background( ) # Determine STT service type - stt_service_type = "local" if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") else "external" - + stt_service_type = ( + "local" + if app_config.STT_SERVICE + and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + # Check if using local STT service if stt_service_type == "local": # Use local Faster-Whisper for transcription from app.services.stt_service import stt_service - + try: result = stt_service.transcribe_file(file_path) transcribed_text = result.get("text", "") - + if not transcribed_text: raise ValueError("Transcription returned empty text") - + # Add metadata about the transcription transcribed_text = ( f"# Transcription of {filename}\n\n{transcribed_text}" @@ -806,9 +811,9 @@ async def process_file_in_background( except Exception as e: raise HTTPException( status_code=422, - detail=f"Failed to transcribe audio file {filename}: {str(e)}" + detail=f"Failed to transcribe audio file {filename}: {e!s}", ) from e - + await task_logger.log_task_progress( log_entry, f"Local STT transcription completed: {filename}", @@ -828,13 +833,17 @@ async def process_file_in_background( "api_key": app_config.STT_SERVICE_API_KEY, } if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - - transcription_response = await atranscription(**transcription_kwargs) + transcription_kwargs["api_base"] = ( + app_config.STT_SERVICE_API_BASE + ) + + transcription_response = await atranscription( + **transcription_kwargs + ) # Extract the transcribed text transcribed_text = transcription_response.get("text", "") - + if not transcribed_text: raise ValueError("Transcription returned empty text") diff --git a/surfsense_backend/app/services/stt_service.py b/surfsense_backend/app/services/stt_service.py index 273fef05b..ea38480e8 100644 --- a/surfsense_backend/app/services/stt_service.py +++ b/surfsense_backend/app/services/stt_service.py @@ -3,15 +3,15 @@ import os import tempfile from pathlib import Path -from typing import Optional from faster_whisper import WhisperModel + from app.config import config class STTService: """Local Speech-to-Text service using Faster-Whisper.""" - + def __init__(self): """Initialize STT service with model from STT_SERVICE config.""" # Parse model from STT_SERVICE (e.g., "local/base" or "local/tiny") @@ -20,8 +20,8 @@ class STTService: self.model_size = stt_service.split("/", 1)[1] else: self.model_size = "base" # fallback - self._model: Optional[WhisperModel] = None - + self._model: WhisperModel | None = None + def _get_model(self) -> WhisperModel: """Lazy load the Whisper model.""" if self._model is None: @@ -33,49 +33,53 @@ class STTService: num_workers=1, # Single worker for stability ) return self._model - - def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> dict: + + def transcribe_file(self, audio_path: str, language: str | None = None) -> dict: """Transcribe audio file to text. - + Args: audio_path: Path to audio file language: Optional language code (e.g., "en", "es") - + Returns: Dict with transcription text and metadata """ model = self._get_model() - + # Transcribe with optimized settings segments, info = model.transcribe( audio_path, language=language, beam_size=1, # Faster inference - best_of=1, # Single pass + best_of=1, # Single pass temperature=0, # Deterministic output vad_filter=True, # Voice activity detection - vad_parameters=dict(min_silence_duration_ms=500), + vad_parameters={"min_silence_duration_ms": 500}, ) - + # Combine all segments text = " ".join(segment.text.strip() for segment in segments) - + return { "text": text, "language": info.language, "language_probability": info.language_probability, "duration": info.duration, } - - def transcribe_bytes(self, audio_bytes: bytes, filename: str = "audio.wav", - language: Optional[str] = None) -> dict: + + def transcribe_bytes( + self, + audio_bytes: bytes, + filename: str = "audio.wav", + language: str | None = None, + ) -> dict: """Transcribe audio from bytes. - + Args: audio_bytes: Audio file bytes filename: Original filename for format detection language: Optional language code - + Returns: Dict with transcription text and metadata """ @@ -84,7 +88,7 @@ class STTService: with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file: tmp_file.write(audio_bytes) tmp_path = tmp_file.name - + try: return self.transcribe_file(tmp_path, language) finally: @@ -93,4 +97,4 @@ class STTService: # Global STT service instance -stt_service = STTService() \ No newline at end of file +stt_service = STTService() diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index b63a20367..bf3ca67a0 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -36,7 +36,7 @@ export default function DocumentsTable() { created_at: true, }); const [pageIndex, setPageIndex] = useState(0); - const [pageSize, setPageSize] = useState(10); + const [pageSize, setPageSize] = useState(50); const [sortKey, setSortKey] = useState("title"); const [sortDesc, setSortDesc] = useState(false); const [selectedIds, setSelectedIds] = useState>(new Set()); diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index ae0ebadfe..4a9a11626 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -85,10 +85,10 @@ Before you begin, ensure you have: | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `local/kokoro`, `openai/tts-1`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | -| TTS_SERVICE_API_KEY | API key for the Text-to-Speech service | +| TTS_SERVICE_API_KEY | (Optional if local) API key for the Text-to-Speech service | | TTS_SERVICE_API_BASE | (Optional) Custom API base URL for the Text-to-Speech service | -| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | -| STT_SERVICE_API_KEY | API key for the Speech-to-Text service | +| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | +| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service | | STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) | diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index d1a5a86dc..5ce285348 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -62,12 +62,11 @@ Edit the `.env` file and set the following variables: | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `local/kokoro`, `openai/tts-1`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | -| TTS_SERVICE_API_KEY | API key for the Text-to-Speech service | -| TTS_SERVICE_API_BASE | (Optional) Custom API base URL for the Text-to-Speech service | -| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | -| STT_SERVICE_API_KEY | API key for the Speech-to-Text service | -| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service | -| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | +| TTS_SERVICE_API_KEY | (Optional if local) API key for the Text-to-Speech service | +| TTS_SERVICE_API_BASE | (Optional) Custom API base URL for the Text-to-Speech service | +| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | +| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service | +| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service | | ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) | | UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | | LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |