chore: updated docs & refactored stt_service.py

2026-07-26 23:51:14 +02:00 · 2025-10-15 14:31:38 -07:00 · 2025-10-15 14:31:38 -07:00 · 70b547c9c9
commit 70b547c9c9
parent 5ed9aa2b0b
5 changed files with 53 additions and 41 deletions
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@ -785,20 +785,25 @@ async def process_file_in_background(
            )

            # Determine STT service type
-            stt_service_type = "local" if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") else "external"
-            
+            stt_service_type = (
+                "local"
+                if app_config.STT_SERVICE
+                and app_config.STT_SERVICE.startswith("local/")
+                else "external"
+            )
+
            # Check if using local STT service
            if stt_service_type == "local":
                # Use local Faster-Whisper for transcription
                from app.services.stt_service import stt_service
-                
+
                try:
                    result = stt_service.transcribe_file(file_path)
                    transcribed_text = result.get("text", "")
-                    
+
                    if not transcribed_text:
                        raise ValueError("Transcription returned empty text")
-                    
+
                    # Add metadata about the transcription
                    transcribed_text = (
                        f"# Transcription of {filename}\n\n{transcribed_text}"
@ -806,9 +811,9 @@ async def process_file_in_background(
                except Exception as e:
                    raise HTTPException(
                        status_code=422,
-                        detail=f"Failed to transcribe audio file {filename}: {str(e)}"
+                        detail=f"Failed to transcribe audio file {filename}: {e!s}",
                    ) from e
-                
+
                await task_logger.log_task_progress(
                    log_entry,
                    f"Local STT transcription completed: {filename}",
@ -828,13 +833,17 @@ async def process_file_in_background(
                        "api_key": app_config.STT_SERVICE_API_KEY,
                    }
                    if app_config.STT_SERVICE_API_BASE:
-                        transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
-                    
-                    transcription_response = await atranscription(**transcription_kwargs)
+                        transcription_kwargs["api_base"] = (
+                            app_config.STT_SERVICE_API_BASE
+                        )
+
+                    transcription_response = await atranscription(
+                        **transcription_kwargs
+                    )

                    # Extract the transcribed text
                    transcribed_text = transcription_response.get("text", "")
-                    
+
                    if not transcribed_text:
                        raise ValueError("Transcription returned empty text")

--- a/surfsense_backend/app/services/stt_service.py
+++ b/surfsense_backend/app/services/stt_service.py
@ -3,15 +3,15 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Optional

 from faster_whisper import WhisperModel
+
 from app.config import config


 class STTService:
    """Local Speech-to-Text service using Faster-Whisper."""
-    
+
    def __init__(self):
        """Initialize STT service with model from STT_SERVICE config."""
        # Parse model from STT_SERVICE (e.g., "local/base" or "local/tiny")
@ -20,8 +20,8 @@ class STTService:
            self.model_size = stt_service.split("/", 1)[1]
        else:
            self.model_size = "base"  # fallback
-        self._model: Optional[WhisperModel] = None
-        
+        self._model: WhisperModel | None = None
+
    def _get_model(self) -> WhisperModel:
        """Lazy load the Whisper model."""
        if self._model is None:
@ -33,49 +33,53 @@ class STTService:
                num_workers=1,  # Single worker for stability
            )
        return self._model
-    
-    def transcribe_file(self, audio_path: str, language: Optional[str] = None) -> dict:
+
+    def transcribe_file(self, audio_path: str, language: str | None = None) -> dict:
        """Transcribe audio file to text.
-        
+
        Args:
            audio_path: Path to audio file
            language: Optional language code (e.g., "en", "es")
-            
+
        Returns:
            Dict with transcription text and metadata
        """
        model = self._get_model()
-        
+
        # Transcribe with optimized settings
        segments, info = model.transcribe(
            audio_path,
            language=language,
            beam_size=1,  # Faster inference
-            best_of=1,    # Single pass
+            best_of=1,  # Single pass
            temperature=0,  # Deterministic output
            vad_filter=True,  # Voice activity detection
-            vad_parameters=dict(min_silence_duration_ms=500),
+            vad_parameters={"min_silence_duration_ms": 500},
        )
-        
+
        # Combine all segments
        text = " ".join(segment.text.strip() for segment in segments)
-        
+
        return {
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
        }
-    
-    def transcribe_bytes(self, audio_bytes: bytes, filename: str = "audio.wav", 
-                        language: Optional[str] = None) -> dict:
+
+    def transcribe_bytes(
+        self,
+        audio_bytes: bytes,
+        filename: str = "audio.wav",
+        language: str | None = None,
+    ) -> dict:
        """Transcribe audio from bytes.
-        
+
        Args:
            audio_bytes: Audio file bytes
            filename: Original filename for format detection
            language: Optional language code
-            
+
        Returns:
            Dict with transcription text and metadata
        """
@ -84,7 +88,7 @@ class STTService:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(audio_bytes)
            tmp_path = tmp_file.name
-        
+
        try:
            return self.transcribe_file(tmp_path, language)
        finally:
@ -93,4 +97,4 @@ class STTService:


 # Global STT service instance
-stt_service = STTService()
+stt_service = STTService()
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx
@ -36,7 +36,7 @@ export default function DocumentsTable() {
 		created_at: true,
 	});
 	const [pageIndex, setPageIndex] = useState(0);
-	const [pageSize, setPageSize] = useState(10);
+	const [pageSize, setPageSize] = useState(50);
 	const [sortKey, setSortKey] = useState<SortKey>("title");
 	const [sortDesc, setSortDesc] = useState(false);
 	const [selectedIds, setSelectedIds] = useState<Set<number>>(new Set());
--- a/surfsense_web/content/docs/docker-installation.mdx
+++ b/surfsense_web/content/docs/docker-installation.mdx
@ -85,10 +85,10 @@ Before you begin, ensure you have:
 | RERANKERS_MODEL_NAME       | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`)                                                                                                                              |
 | RERANKERS_MODEL_TYPE       | Type of reranker model (e.g., `flashrank`)                                                                                                                                                |
 | TTS_SERVICE                | Text-to-Speech API provider for Podcasts (e.g., `local/kokoro`, `openai/tts-1`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers)                            |
-| TTS_SERVICE_API_KEY        | API key for the Text-to-Speech service                                                                                                                                                    |
+| TTS_SERVICE_API_KEY        | (Optional if local) API key for the Text-to-Speech service                                                                                                                                                    |
 | TTS_SERVICE_API_BASE       | (Optional) Custom API base URL for the Text-to-Speech service                                                                                                                           |
-| STT_SERVICE                | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers)                   |
-| STT_SERVICE_API_KEY        | API key for the Speech-to-Text service                                                                                                                                                    |
+| STT_SERVICE                | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers)                   |
+| STT_SERVICE_API_KEY        | (Optional if local) API key for the Speech-to-Text service                                                                                                                                                    |
 | STT_SERVICE_API_BASE       | (Optional) Custom API base URL for the Speech-to-Text service                                                                                                                      |
 | FIRECRAWL_API_KEY          | API key for Firecrawl service for web crawling                                                                                                                                            |
 | ETL_SERVICE                | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV)                                                  |
--- a/surfsense_web/content/docs/manual-installation.mdx
+++ b/surfsense_web/content/docs/manual-installation.mdx
@ -62,12 +62,11 @@ Edit the `.env` file and set the following variables:
 | RERANKERS_MODEL_NAME       | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`)                                                                                                                              |
 | RERANKERS_MODEL_TYPE       | Type of reranker model (e.g., `flashrank`)                                                                                                                                                |
 | TTS_SERVICE                | Text-to-Speech API provider for Podcasts (e.g., `local/kokoro`, `openai/tts-1`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers)                            |
-| TTS_SERVICE_API_KEY        | API key for the Text-to-Speech service                                                                                                                                                    |
-| TTS_SERVICE_API_BASE       | (Optional) Custom API base URL for the Text-to-Speech service                                                                                                                          |
-| STT_SERVICE                | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers)                   |
-| STT_SERVICE_API_KEY        | API key for the Speech-to-Text service                                                                                                                                                    |
-| STT_SERVICE_API_BASE       | (Optional) Custom API base URL for the Speech-to-Text service                                                                                                                            |
-| FIRECRAWL_API_KEY          | API key for Firecrawl service for web crawling                                                                                                                                            |
+| TTS_SERVICE_API_KEY        | (Optional if local) API key for the Text-to-Speech service                                                                                                                                                    |
+| TTS_SERVICE_API_BASE       | (Optional) Custom API base URL for the Text-to-Speech service                                                                                                                           |
+| STT_SERVICE                | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers)                   |
+| STT_SERVICE_API_KEY        | (Optional if local) API key for the Speech-to-Text service                                                                                                                                                    |
+| STT_SERVICE_API_BASE       | (Optional) Custom API base URL for the Speech-to-Text service                                                                                                                      |
 | ETL_SERVICE                | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV)                                                  |
 | UNSTRUCTURED_API_KEY       | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED)                                                                                           |
 | LLAMA_CLOUD_API_KEY        | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD)                                                                                                  |