Merge branch 'main' into feat/telnyx-telephony

2026-06-10 08:05:22 +02:00 · 2026-03-25 16:15:29 +05:30 · 2026-03-25 16:15:29 +05:30 · 9dc64456d8
commit 9dc64456d8
parent 6efa38002f dc800bdd63
39 changed files with 1071 additions and 313 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,3 +16,4 @@ venv/
 .playwright-mcp
 coturn/
 *.wav
+dograh_pcm_cache/
--- a/api/db/workflow_recording_client.py
+++ b/api/db/workflow_recording_client.py
@ -64,7 +64,7 @@ class WorkflowRecordingClient(BaseDBClient):
                storage_key=storage_key,
                storage_backend=storage_backend,
                created_by=created_by,
-                metadata=metadata or {},
+                recording_metadata=metadata or {},
            )

            session.add(recording)
--- a/api/routes/s3_signed_url.py
+++ b/api/routes/s3_signed_url.py
@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
 router = APIRouter(prefix="/s3", tags=["s3"])


+def _extract_org_id_from_key(key: str) -> Optional[int]:
+    """Try to extract an organization ID from a storage key.
+
+    Matches keys of the form ``{prefix}/{org_id}/...`` where *org_id* is a
+    run of ASCII decimal digits.
+
+    Args:
+        key: Storage object key, with segments separated by ``/``.
+
+    Returns:
+        The organization ID as an ``int``, or ``None`` when the key has
+        fewer than three segments or the second segment is not numeric.
+    """
+    parts = key.split("/")
+    if len(parts) < 3:
+        return None
+
+    candidate = parts[1]
+    # str.isdigit() on its own also accepts characters such as "²" that
+    # int() rejects with ValueError; requiring ASCII makes such keys return
+    # None instead of raising.
+    if candidate.isascii() and candidate.isdigit():
+        return int(candidate)
+    return None
+
+
+def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
+    """Extract a workflow_run_id from legacy key formats.
+
+    Supports:
+      - ``transcripts/{run_id}.txt``
+      - ``recordings/{run_id}.wav``
+
+    Args:
+        key: Storage object key to inspect.
+
+    Returns:
+        The run ID as an ``int``, or ``None`` when the key does not match a
+        legacy pattern or ``{run_id}`` is not a run of ASCII decimal digits.
+    """
+    if key.startswith("transcripts/") and key.endswith(".txt"):
+        run_id_str = key[len("transcripts/") : -len(".txt")]
+    elif key.startswith("recordings/") and key.endswith(".wav"):
+        run_id_str = key[len("recordings/") : -len(".wav")]
+    else:
+        return None
+
+    # str.isdigit() on its own also accepts characters such as "²" that
+    # int() rejects with ValueError; requiring ASCII makes such keys return
+    # None instead of raising.
+    if run_id_str.isascii() and run_id_str.isdigit():
+        return int(run_id_str)
+    return None
+
+
+# Keep for backward compat with file-metadata endpoint
 async def _validate_and_extract_workflow_run_id(
    key: str, allow_special_paths: bool = False
 ) -> Optional[int]:
@ -118,64 +150,68 @@ async def get_signed_url(
    key: Annotated[str, Query(description="S3 object key")],
    expires_in: int = 3600,
    inline: bool = False,
+    storage_backend: Annotated[
+        Optional[str],
+        Query(
+            description="Storage backend to use (e.g. 'minio', 's3'). "
+            "When omitted the backend is inferred from the resource."
+        ),
+    ] = None,
    user=Depends(get_user),
 ):
-    """Return a short-lived signed URL for a transcript or recording file stored on S3.
+    """Return a short-lived signed URL for a file stored on S3 / MinIO.

    Access Control:
+    * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+      authorized by matching the org_id against the requesting user's
+      organization.
+    * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+      are authorized via the workflow run they belong to.
    * Superusers can request any key.
-    * Regular users can only request resources belonging to **their** workflow runs.
    """

-    # Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
-    run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
-    if run_id is None:
-        raise HTTPException(status_code=400, detail="Invalid key format")
+    # ------------------------------------------------------------------
+    # 1. Authorize
+    # ------------------------------------------------------------------
+    workflow_run = None

-    # Authorize and get workflow run
-    workflow_run = await _authorize_and_get_workflow_run(run_id, user)
+    org_id = _extract_org_id_from_key(key)
+    if org_id is not None:
+        # Generic org-based auth
+        if not user.is_superuser and org_id != user.selected_organization_id:
+            raise HTTPException(status_code=403, detail="Access denied")
+    else:
+        # Legacy workflow-run-based auth
+        run_id = _extract_legacy_workflow_run_id(key)
+        if run_id is None:
+            raise HTTPException(status_code=400, detail="Invalid key format")
+        workflow_run = await _authorize_and_get_workflow_run(run_id, user)

    # ------------------------------------------------------------------
-    # 3. Generate the signed URL using the correct storage backend
+    # 2. Resolve storage backend
    # ------------------------------------------------------------------
    try:
-        # Use the storage backend recorded when the file was uploaded
-        if (
+        if storage_backend:
+            storage = get_storage_for_backend(storage_backend)
+        elif (
            workflow_run
            and hasattr(workflow_run, "storage_backend")
            and workflow_run.storage_backend
        ):
-            backend = workflow_run.storage_backend
-            storage = get_storage_for_backend(backend)
-            logger.info(
-                f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
-            )
+            storage = get_storage_for_backend(workflow_run.storage_backend)
        else:
-            # Fallback to current storage for legacy records without storage_backend
            storage = storage_fs
-            current_backend = StorageBackend.get_current_backend()
-            logger.warning(
-                f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
-            )

+        # ------------------------------------------------------------------
+        # 3. Generate the signed URL
+        # ------------------------------------------------------------------
        url = await storage.aget_signed_url(
            key, expiration=expires_in, force_inline=inline
        )
        if not url:
            raise HTTPException(status_code=500, detail="Failed to generate signed URL")

-        # Log successful URL generation
-        backend_info = (
-            f"stored {backend}"
-            if workflow_run
-            and hasattr(workflow_run, "storage_backend")
-            and workflow_run.storage_backend
-            else f"current {StorageBackend.get_current_backend().name}"
-        )
-        logger.info(
-            f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
-        )
-
+        logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
        return {"url": url, "expires_in": expires_in}
    except ClientError as exc:
        logger.error(f"Error generating signed URL: {exc}")
--- a/api/routes/workflow_recording.py
+++ b/api/routes/workflow_recording.py
@ -2,9 +2,10 @@

 from typing import Annotated, Optional

-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
 from loguru import logger

+from api.constants import DEPLOYMENT_MODE
 from api.db import db_client
 from api.db.workflow_recording_client import generate_short_id
 from api.enums import StorageBackend
@ -16,6 +17,7 @@ from api.schemas.workflow_recording import (
    RecordingUploadResponseSchema,
 )
 from api.services.auth.depends import get_user
+from api.services.mps_service_key_client import mps_service_key_client
 from api.services.storage import storage_fs

 router = APIRouter(prefix="/workflow-recordings", tags=["workflow-recordings"])
@ -216,3 +218,42 @@ async def delete_recording(
        raise HTTPException(
            status_code=500, detail="Failed to delete recording"
        ) from exc
+
+
+@router.post(
+    "/transcribe",
+    summary="Transcribe an audio file",
+)
+async def transcribe_audio(
+    file: UploadFile = File(...),
+    language: str = Form("en"),
+    user=Depends(get_user),
+):
+    """Transcribe an uploaded audio file using MPS STT."""
+    # Reads the whole upload into memory and forwards it to the MPS client.
+    # Any failure is logged and reported to the caller as a 500 response.
+    try:
+        audio_data = await file.read()
+
+        # The caller is identified by provider ID when DEPLOYMENT_MODE is
+        # "oss", and by the selected organization otherwise.
+        if DEPLOYMENT_MODE == "oss":
+            caller_identity = {"created_by": str(user.provider_id)}
+        else:
+            caller_identity = {"organization_id": user.selected_organization_id}
+
+        return await mps_service_key_client.transcribe_audio(
+            audio_data=audio_data,
+            filename=file.filename or "audio.wav",
+            content_type=file.content_type or "audio/wav",
+            language=language,
+            **caller_identity,
+        )
+    except Exception as exc:
+        logger.error(f"Error transcribing audio: {exc}")
+        raise HTTPException(
+            status_code=500, detail="Failed to transcribe audio"
+        ) from exc
--- a/api/services/configuration/check_validity.py
+++ b/api/services/configuration/check_validity.py
@ -40,6 +40,7 @@ class UserConfigurationValidator:
            ServiceProviders.SPEECHMATICS.value: self._check_speechmatics_api_key,
            ServiceProviders.CAMB.value: self._check_camb_api_key,
            ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
+            ServiceProviders.SELF_HOSTED.value: self._check_self_hosted_api_key,
        }

    async def validate(self, configuration: UserConfiguration) -> APIKeyStatusResponse:
@ -74,6 +75,20 @@ class UserConfigurationValidator:

        provider = service_config.provider

+        # Self-hosted doesn't require an API key
+        if provider == ServiceProviders.SELF_HOSTED.value:
+            try:
+                if not self._check_self_hosted_api_key(provider, service_config):
+                    return [
+                        {
+                            "model": service_name,
+                            "message": f"Invalid {provider} configuration",
+                        }
+                    ]
+            except ValueError as e:
+                return [{"model": service_name, "message": str(e)}]
+            return []
+
        # AWS Bedrock uses AWS credentials instead of api_key
        if provider == ServiceProviders.AWS_BEDROCK.value:
            try:
@ -163,7 +178,12 @@ class UserConfigurationValidator:

    def _check_camb_api_key(self, model: str, api_key: str) -> bool:
        return True
-      
+
+    def _check_self_hosted_api_key(self, model: str, service_config) -> bool:
+        """Validate a self-hosted service configuration.
+
+        Only the presence of a non-empty ``base_url`` attribute on
+        ``service_config`` is checked; ``model`` is not inspected.
+
+        Returns:
+            ``True`` when ``base_url`` is set.
+
+        Raises:
+            ValueError: If ``base_url`` is missing or empty.
+        """
+        configured_url = getattr(service_config, "base_url", None)
+        if configured_url:
+            return True
+        raise ValueError("base_url is required for self-hosted LLM")
+
    def _check_aws_bedrock_api_key(self, model: str, service_config) -> bool:
        if not service_config.aws_access_key or not service_config.aws_secret_key:
            raise ValueError("AWS access key and secret key are required for Bedrock")
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -27,6 +27,7 @@ class ServiceProviders(str, Enum):
    SPEECHMATICS = "speechmatics"
    CAMB = "camb"
    AWS_BEDROCK = "aws_bedrock"
+    SELF_HOSTED = "self_hosted"


 class BaseServiceConfiguration(BaseModel):
@ -40,6 +41,7 @@ class BaseServiceConfiguration(BaseModel):
        ServiceProviders.AZURE,
        ServiceProviders.DOGRAH,
        ServiceProviders.AWS_BEDROCK,
+        ServiceProviders.SELF_HOSTED,
        # ServiceProviders.SARVAM,
    ]
    api_key: str | list[str]
@ -249,6 +251,22 @@ class AWSBedrockLLMConfiguration(BaseLLMConfiguration):
    api_key: str | list[str] | None = Field(default=None)


+# Model names offered as schema examples for the self-hosted ``model`` field
+# below; the field itself is a plain ``str``.
+SELF_HOSTED_LLM_MODELS = ["llama3", "mistral", "phi3", "qwen2", "gemma2", "deepseek-r1"]
+
+
+# LLM configuration for the SELF_HOSTED provider, registered via
+# ``register_llm``.
+# NOTE(review): documented with ``#`` comments rather than a class docstring
+# so the model's generated schema description is left as-is.
+@register_llm
+class SelfHostedLLMConfiguration(BaseLLMConfiguration):
+    # Provider tag fixed to SELF_HOSTED.
+    provider: Literal[ServiceProviders.SELF_HOSTED] = ServiceProviders.SELF_HOSTED
+    # Model name; defaults to "llama3", with SELF_HOSTED_LLM_MODELS attached
+    # as JSON-schema examples.
+    model: str = Field(
+        default="llama3", json_schema_extra={"examples": SELF_HOSTED_LLM_MODELS}
+    )
+    # URL of the endpoint to call; defaults to a localhost address.
+    base_url: str = Field(
+        default="http://localhost:11434/v1",
+        description="OpenAI-compatible endpoint (Ollama, vLLM, etc.)",
+    )
+    # Optional for this provider: defaults to None.
+    api_key: str | list[str] | None = Field(default=None)
+
+
 LLMConfig = Annotated[
    Union[
        OpenAILLMService,
@ -258,6 +276,7 @@ LLMConfig = Annotated[
        AzureLLMService,
        DograhLLMService,
        AWSBedrockLLMConfiguration,
+        SelfHostedLLMConfiguration,
    ],
    Field(discriminator="provider"),
 ]
@ -334,6 +353,12 @@ class CartesiaTTSConfiguration(BaseTTSConfiguration):
    )
    voice: str = Field(default="3faa81ae-d3d8-4ab1-9e44-e50e46d33c30")
    speed: float = Field(default=1.0, ge=0.6, le=1.5, description="Speed of the voice")
+    volume: float = Field(
+        default=1.0,
+        ge=0.5,
+        le=2.0,
+        description="Volume multiplier for generated speech",
+    )


 SARVAM_TTS_MODELS = ["bulbul:v2", "bulbul:v3"]
--- a/api/services/mps_service_key_client.py
+++ b/api/services/mps_service_key_client.py
@ -351,6 +351,71 @@ class MPSServiceKeyClient:
                    response=response,
                )

+    async def transcribe_audio(
+        self,
+        audio_data: bytes,
+        filename: str = "audio.wav",
+        content_type: str = "audio/wav",
+        language: str = "en",
+        model: str = "default",
+        correlation_id: Optional[str] = None,
+        organization_id: Optional[int] = None,
+        created_by: Optional[str] = None,
+    ) -> dict:
+        """
+        Send an audio file to the MPS STT endpoint and return its response.
+
+        The audio is posted as a multipart upload to
+        ``{base_url}/api/v1/stt/transcribe`` with a 60 second timeout.
+
+        Args:
+            audio_data: Raw audio bytes
+            filename: File name sent in the multipart ``file`` part
+            content_type: MIME type sent in the multipart ``file`` part
+            language: Value of the ``language`` form field (default: "en")
+            model: Value of the ``model`` form field (default: "default")
+            correlation_id: Sent as a form field only when truthy
+            organization_id: Passed to ``_get_headers`` when building headers
+            created_by: Passed to ``_get_headers`` when building headers
+
+        Returns:
+            The parsed JSON body of a 200 response.
+
+        Raises:
+            httpx.HTTPStatusError: If the response status is not 200
+        """
+        async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
+            form_fields = {"language": language, "model": model}
+            if correlation_id:
+                form_fields["correlation_id"] = correlation_id
+
+            request_headers = self._get_headers(organization_id, created_by)
+            # Drop any preset Content-Type so httpx can emit its own
+            # multipart header including the boundary.
+            request_headers.pop("Content-Type", None)
+
+            response = await client.post(
+                f"{self.base_url}/api/v1/stt/transcribe",
+                files={"file": (filename, audio_data, content_type)},
+                data=form_fields,
+                headers=request_headers,
+            )
+
+            # Anything other than 200 is logged and surfaced as an error.
+            if response.status_code != 200:
+                logger.error(
+                    f"Failed to transcribe audio: {response.status_code} - {response.text}"
+                )
+                raise httpx.HTTPStatusError(
+                    f"Failed to transcribe audio: {response.text}",
+                    request=response.request,
+                    response=response,
+                )
+
+            return response.json()
+
+
    def validate_service_key(self, service_key: str) -> bool:
        """
        Synchronously validate a Dograh service key by checking usage via MPS.
--- a/api/services/pipecat/realtime_feedback_observer.py
+++ b/api/services/pipecat/realtime_feedback_observer.py
@ -165,49 +165,39 @@ class RealtimeFeedbackObserver(BaseObserver):
        frame = data.frame
        frame_direction = data.direction

-        logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
-
-        # Handle pipeline termination - stop clock task
-        if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
-            await self._cancel_clock_task()
-            return
-
-        # Handle interruptions - clear any queued bot text
-        if isinstance(frame, InterruptionFrame):
-            await self._handle_interruption()
-            return
-
-        # Bot speaking state - WS only (ephemeral state signals, not persisted)
-        if isinstance(frame, BotStartedSpeakingFrame):
-            await self._send_ws(
-                {"type": RealtimeFeedbackType.BOT_STARTED_SPEAKING.value, "payload": {}}
-            )
-            return
-        if isinstance(frame, BotStoppedSpeakingFrame):
-            await self._send_ws(
-                {"type": RealtimeFeedbackType.BOT_STOPPED_SPEAKING.value, "payload": {}}
-            )
-            return
-
-        # User mute state - WS only (ephemeral state signals, not persisted)
-        if isinstance(frame, UserMuteStartedFrame):
-            await self._send_ws(
-                {"type": RealtimeFeedbackType.USER_MUTE_STARTED.value, "payload": {}}
-            )
-            return
-        if isinstance(frame, UserMuteStoppedFrame):
-            await self._send_ws(
-                {"type": RealtimeFeedbackType.USER_MUTE_STOPPED.value, "payload": {}}
-            )
-            return
-
        # Skip already processed frames (frames can be observed multiple times)
        if frame.id in self._frames_seen:
            return
        self._frames_seen.add(frame.id)

+        logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
+
+        # Handle pipeline termination - stop clock task
+        if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
+            await self._cancel_clock_task()
+        # Handle interruptions - clear any queued bot text
+        elif isinstance(frame, InterruptionFrame):
+            await self._handle_interruption()
+        # Bot speaking state - WS only (ephemeral state signals, not persisted)
+        elif isinstance(frame, BotStartedSpeakingFrame):
+            await self._send_ws(
+                {"type": RealtimeFeedbackType.BOT_STARTED_SPEAKING.value, "payload": {}}
+            )
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            await self._send_ws(
+                {"type": RealtimeFeedbackType.BOT_STOPPED_SPEAKING.value, "payload": {}}
+            )
+        # User mute state - WS only (ephemeral state signals, not persisted)
+        elif isinstance(frame, UserMuteStartedFrame):
+            await self._send_ws(
+                {"type": RealtimeFeedbackType.USER_MUTE_STARTED.value, "payload": {}}
+            )
+        elif isinstance(frame, UserMuteStoppedFrame):
+            await self._send_ws(
+                {"type": RealtimeFeedbackType.USER_MUTE_STOPPED.value, "payload": {}}
+            )
        # Handle user transcriptions (interim) - WebSocket only
-        if isinstance(frame, InterimTranscriptionFrame):
+        elif isinstance(frame, InterimTranscriptionFrame):
            await self._send_ws(
                {
                    "type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
--- a/api/services/pipecat/recording_router_processor.py
+++ b/api/services/pipecat/recording_router_processor.py
@ -66,6 +66,7 @@ class RecordingRouterProcessor(FrameProcessor):
        self._frame_buffer: list[tuple[LLMTextFrame, FrameDirection]] = []
        self._mode: Optional[str] = None  # None = detecting, "tts", "recording"
        self._recording_id_buffer = ""
+        self._recording_playback_started = False

    # ------------------------------------------------------------------
    # Frame dispatch
@ -99,9 +100,15 @@ class RecordingRouterProcessor(FrameProcessor):
            await self.push_frame(frame, direction)
            return

-        # --- Recording mode: accumulate recording_id silently ---
+        # --- Recording mode: accumulate text and start playback ASAP ---
        if self._mode == "recording":
            self._recording_id_buffer += frame.text
+            if not self._recording_playback_started:
+                buf = self._recording_id_buffer.lstrip()
+                if " " in buf:
+                    recording_id = buf.split()[0]
+                    self._recording_playback_started = True
+                    await self._play_recording(recording_id)
            return

        # --- Detection mode: buffer until marker found ---
@ -178,16 +185,21 @@ class RecordingRouterProcessor(FrameProcessor):
        self, frame: LLMFullResponseEndFrame, direction: FrameDirection
    ):
        if self._mode == "recording":
-            recording_id = self._recording_id_buffer.strip()
-            if recording_id:
-                # Push accumulated text as TTSTextFrame for UI feedback via observer
+            full_text = self._recording_id_buffer.strip()
+            if full_text:
+                recording_id = full_text.split()[0]
+
+                # Push full text (marker + id + transcript) for assistant context
                await self.push_frame(
                    TTSTextFrame(
                        text=RECORDING_MARKER + self._recording_id_buffer,
                        aggregated_by="recording_router",
                    )
                )
-                await self._play_recording(recording_id)
+
+                # Fallback: if response ended before a space arrived (no transcript)
+                if not self._recording_playback_started:
+                    await self._play_recording(recording_id)
            else:
                logger.warning(
                    "RecordingRouterProcessor: recording mode but empty recording_id"
@ -256,3 +268,4 @@ class RecordingRouterProcessor(FrameProcessor):
        self._frame_buffer = []
        self._mode = None
        self._recording_id_buffer = ""
+        self._recording_playback_started = False
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -8,7 +8,11 @@ from api.services.configuration.registry import ServiceProviders
 from pipecat.services.aws.llm import AWSBedrockLLMService, AWSBedrockLLMSettings
 from pipecat.services.azure.llm import AzureLLMService, AzureLLMSettings
 from pipecat.services.cartesia.stt import CartesiaSTTService
-from pipecat.services.cartesia.tts import CartesiaTTSService, CartesiaTTSSettings, GenerationConfig
+from pipecat.services.cartesia.tts import (
+    CartesiaTTSService,
+    CartesiaTTSSettings,
+    GenerationConfig,
+)
 from pipecat.services.deepgram.flux.stt import (
    DeepgramFluxSTTService,
    DeepgramFluxSTTSettings,
@ -212,13 +216,19 @@ def create_tts_service(user_config, audio_config: "AudioConfig"):
        )
    elif user_config.tts.provider == ServiceProviders.CARTESIA.value:
        speed = getattr(user_config.tts, "speed", None)
-        generation_config = GenerationConfig(speed=speed) if speed and speed != 1.0 else None
+        generation_config = (
+            GenerationConfig(speed=speed) if speed and speed != 1.0 else None
+        )
        return CartesiaTTSService(
            api_key=user_config.tts.api_key,
            settings=CartesiaTTSSettings(
                voice=user_config.tts.voice,
                model=user_config.tts.model,
-                **({"generation_config": generation_config} if generation_config else {}),
+                **(
+                    {"generation_config": generation_config}
+                    if generation_config
+                    else {}
+                ),
            ),
            text_filters=[xml_function_tag_filter],
            silence_time_s=1.0,
@ -353,6 +363,12 @@ def create_llm_service_from_provider(
            aws_region=aws_region,
            settings=AWSBedrockLLMSettings(model=model),
        )
+    elif provider == ServiceProviders.SELF_HOSTED.value:
+        return OpenAILLMService(
+            base_url=base_url or "http://localhost:11434/v1",
+            api_key=api_key or "none",
+            settings=OpenAILLMSettings(model=model),
+        )
    else:
        raise HTTPException(status_code=400, detail=f"Invalid LLM provider {provider}")

@ -368,6 +384,8 @@ def create_llm_service(user_config):
        kwargs["base_url"] = user_config.llm.base_url
    elif provider == ServiceProviders.AZURE.value:
        kwargs["endpoint"] = user_config.llm.endpoint
+    elif provider == ServiceProviders.SELF_HOSTED.value:
+        kwargs["base_url"] = user_config.llm.base_url
    elif provider == ServiceProviders.AWS_BEDROCK.value:
        kwargs["aws_access_key"] = user_config.llm.aws_access_key
        kwargs["aws_secret_key"] = user_config.llm.aws_secret_key
--- a/api/services/workflow/pipecat_engine.py
+++ b/api/services/workflow/pipecat_engine.py
@ -437,9 +437,7 @@ class PipecatEngine:

        async def _do_extraction():
            try:
-                logger.debug(
-                    f"Starting variable extraction for node: {node.name}"
-                )
+                logger.debug(f"Starting variable extraction for node: {node.name}")
                extracted_data = (
                    await self._variable_extraction_manager._perform_extraction(
                        extraction_variables, parent_context, extraction_prompt
@ -454,7 +452,9 @@ class PipecatEngine:
                    f"Variable extraction completed for node: {node.name}. Extracted: {extracted_data}"
                )
            except Exception as e:
-                logger.error(f"Error during variable extraction for node {node.name}: {str(e)}")
+                logger.error(
+                    f"Error during variable extraction for node {node.name}: {str(e)}"
+                )

        if run_in_background:
            logger.debug(
@ -497,9 +497,7 @@ class PipecatEngine:
                    logger.error(
                        f"Pending extraction task '{task_name}' failed: {result}"
                    )
-            logger.debug(
-                f"All pending extraction tasks completed in {elapsed:.2f}s"
-            )
+            logger.debug(f"All pending extraction tasks completed in {elapsed:.2f}s")
        except asyncio.TimeoutError:
            incomplete = [
                t.get_name() for t in self._pending_extraction_tasks if not t.done()
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -34,13 +34,13 @@ You have two modes for responding:
   Example: ▸ Hello! How can I help you today?

 2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
-   Format: `●` followed by a space and ONLY the recording_id. Nothing else.
-   Example: ● rec_greeting_01
+   Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
+   Example: ● rec_greeting_01 [ Provided Transcript ]

 RULES:
 - Your response MUST start with either `▸` or `●` as the very first character.
 - For `▸` (dynamic speech): Follow with a space and your full response text.
- For `●` (pre-recorded audio): Follow with a space and ONLY the recording_id. No other text.
+- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
 - Use `●` when a pre-recorded message matches the situation well.
 - Use `▸` when you need to generate a dynamic, contextual response.
 - NEVER mix modes in a single response. Choose one."""
@ -77,11 +77,8 @@ def compose_system_prompt_for_node(

    parts = [p for p in (global_prompt, formatted_node_prompt) if p]

-    if has_recordings:
+    if has_recordings and "RECORDING_ID:" in formatted_node_prompt:
        parts.append(RECORDING_RESPONSE_MODE_INSTRUCTIONS)
-        # TODO: Append per-node available recordings list here once
-        # Node.recording_ids is populated. The list should include
-        # recording_id and a short description so the LLM can choose.

    return "\n\n".join(parts)

--- a/api/services/workflow/qa/analysis.py
+++ b/api/services/workflow/qa/analysis.py
@ -28,7 +28,9 @@ from api.utils.template_renderer import render_template
 from pipecat.processors.aggregators.llm_context import LLMContext


-async def _run_llm_inference(llm, messages: list[dict], system_prompt: str) -> str | None:
+async def _run_llm_inference(
+    llm, messages: list[dict], system_prompt: str
+) -> str | None:
    """Run a one-shot LLM inference using the pipecat service."""
    context = LLMContext()
    context.set_messages(messages)
@ -51,7 +53,10 @@ async def _generate_conversation_summary(
    ]

    try:
-        summary = await _run_llm_inference(llm, messages, CONVERSATION_SUMMARY_SYSTEM_PROMPT) or ""
+        summary = (
+            await _run_llm_inference(llm, messages, CONVERSATION_SUMMARY_SYSTEM_PROMPT)
+            or ""
+        )

        span_name = f"conversation-summary-before-{node_name}"
        add_qa_span_to_trace(parent_ctx, model, messages, summary, span_name)
--- a/api/services/workflow/qa/node_summary.py
+++ b/api/services/workflow/qa/node_summary.py
@ -154,7 +154,12 @@ async def ensure_node_summaries(
        try:
            context = LLMContext()
            context.set_messages(messages)
-            summary_text = await llm.run_inference(context, system_instruction=NODE_SUMMARY_SYSTEM_PROMPT) or ""
+            summary_text = (
+                await llm.run_inference(
+                    context, system_instruction=NODE_SUMMARY_SYSTEM_PROMPT
+                )
+                or ""
+            )
        except Exception as e:
            logger.warning(f"Failed to generate summary for node {node_id}: {e}")
            updated_summaries[node_id] = {"summary": ""}
--- a/api/tests/test_camb_tts_integration.py
+++ b/api/tests/test_camb_tts_integration.py
@ -9,7 +9,7 @@ Covers:
 """

 from types import SimpleNamespace
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch

 import pytest
 from pydantic import ValidationError
@ -17,13 +17,12 @@ from pydantic import ValidationError
 from api.services.configuration.check_validity import UserConfigurationValidator
 from api.services.configuration.registry import (
    CAMB_TTS_MODELS,
-    CambTTSConfiguration,
    REGISTRY,
+    CambTTSConfiguration,
    ServiceProviders,
    ServiceType,
 )

-
 # ---------------------------------------------------------------------------
 # 1. CambTTSConfiguration model tests
 # ---------------------------------------------------------------------------
--- a/docs/docs.json
+++ b/docs/docs.json
@ -54,6 +54,7 @@
            "pages": [
              "voice-agent/introduction",
              "voice-agent/editing-a-workflow",
+              "voice-agent/custom-recordings",
              "voice-agent/template-variables",
              {
                "group": "Tools",
--- a/docs/voice-agent/custom-recordings.mdx
+++ b/docs/voice-agent/custom-recordings.mdx
@ -0,0 +1,79 @@
+---
+title: "Custom Recordings"
+description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
+---
+
+Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated speech (via a cloned voice) for dynamic responses. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
+
+<iframe
+  width="560"
+  height="315"
+  src="https://www.youtube.com/embed/1uZqhG0_cIo"
+  title="YouTube video player"
+  frameborder="0"
+  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+  referrerpolicy="strict-origin-when-cross-origin"
+  allowfullscreen>
+</iframe>
+
+## Why use custom recordings?
+
+- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
+- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
+- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
+
+## Prerequisites
+
+- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
+- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
+
+## Step 1: Clone your voice
+
+Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
+
+1. Go to Cartesia and navigate to **Instant Clone**.
+2. Record a short audio clip (up to 10 seconds) of your voice.
+3. Give the clone a name and select your language.
+4. Copy the **Voice ID** — you will need it in the next step.
+
+<Note>
+You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
+</Note>
+
+## Step 2: Configure the cloned voice in Dograh
+
+1. Go to your agent's **Model Configuration** in the Dograh dashboard.
+2. Under voice settings, select **Add Voice ID manually**.
+3. Paste the Voice ID from your cloned voice.
+4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
+5. Enter the provider's API key if you haven't already.
+6. Save the configuration.
+
+## Step 3: Upload recordings
+
+Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
+
+For each recording:
+
+1. Click **Record** (or upload a file).
+2. Speak the exact phrase you want the agent to use.
+3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
+4. Verify the transcription is correct — edit it if needed.
+5. Click **Upload**.
+
+<Warning>
+Recordings are scoped to a specific **provider, model, and Voice ID**. If you change any of them, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
+</Warning>
+
+## Step 4: Build the workflow
+
+Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current TTS provider, model, and Voice ID.
+
+For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
+
+## Tips for best results
+
+- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
+- **Use pro cloning services** (when available) and provide more sample audio for a higher-quality voice clone.
+- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
+- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 3f566a4ba1e112255cc7459735bdb4b716948d59
+Subproject commit 2e2171e2a64ec87b3964fbc2440b5291489912a8
--- a/ui/AGENTS.md
+++ b/ui/AGENTS.md
@ -48,6 +48,12 @@ new api route in backend, and wish to use it in the UI, generate the client usin
 npm run generate-client
 ```

+## Conventions
+
+### File Uploads
+
+Always use a hidden `<input type="file">` with a visible `<Button>` that triggers it via `fileInputRef.current?.click()`. Never use a visible `<Input type="file">` — the native file input styling is inconsistent and confusing. Show the selected filename next to or below the button.
+
 ## Development

 ```bash
--- a/ui/src/app/superadmin/runs/page.tsx
+++ b/ui/src/app/superadmin/runs/page.tsx
@ -519,13 +519,17 @@ export default function RunsPage() {
                                                                variant="outline"
                                                                size="icon"
                                                                onClick={() => {
-                                                                    const filter = encodeURIComponent(
-                                                                        `metadata;stringObject;attributes;contains;conversation.id,metadata;stringObject;attributes;contains;${run.id}`,
-                                                                    );
-                                                                    window.open(
-                                                                        `${process.env.NEXT_PUBLIC_LANGFUSE_ENDPOINT}/project/${process.env.NEXT_PUBLIC_LANGFUSE_PROJECT_ID}/traces?search=&filter=${filter}&dateRange=All+time`,
-                                                                        '_blank',
-                                                                    );
+                                                                    if (run.gathered_context?.trace_url) {
+                                                                        window.open(String(run.gathered_context.trace_url), '_blank');
+                                                                    } else {
+                                                                        const filter = encodeURIComponent(
+                                                                            `metadata;stringObject;attributes;contains;conversation.id,metadata;stringObject;attributes;contains;${run.id}`,
+                                                                        );
+                                                                        window.open(
+                                                                            `${process.env.NEXT_PUBLIC_LANGFUSE_ENDPOINT}/project/${process.env.NEXT_PUBLIC_LANGFUSE_PROJECT_ID}/traces?search=&filter=${filter}&dateRange=All+time`,
+                                                                            '_blank',
+                                                                        );
+                                                                    }
                                                                }}
                                                            >
                                                                <Image
--- a/ui/src/app/workflow/[workflowId]/RenderWorkflow.tsx
+++ b/ui/src/app/workflow/[workflowId]/RenderWorkflow.tsx
@ -14,6 +14,7 @@ import type { DocumentResponseSchema, RecordingResponseSchema, ToolResponse } fr
 import { FlowEdge, FlowNode, NodeType } from "@/components/flow/types";
 import { Button } from '@/components/ui/button';
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip';
+import { useUserConfig } from '@/context/UserConfigContext';
 import { WorkflowConfigurations } from '@/types/workflow-configurations';

 import AddNodePanel from "../../../components/flow/AddNodePanel";
@ -64,6 +65,11 @@ interface RenderWorkflowProps {
 }

 function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialTemplateContextVariables, initialWorkflowConfigurations, user }: RenderWorkflowProps) {
+    const { userConfig } = useUserConfig();
+    const ttsProvider = (userConfig?.tts?.provider as string) ?? "";
+    const ttsModel = (userConfig?.tts?.model as string) ?? "";
+    const ttsVoiceId = (userConfig?.tts?.voice as string) ?? "";
+
    const [isContextVarsDialogOpen, setIsContextVarsDialogOpen] = useState(false);
    const [isConfigurationsDialogOpen, setIsConfigurationsDialogOpen] = useState(false);
    const [isDictionaryDialogOpen, setIsDictionaryDialogOpen] = useState(false);
@ -125,10 +131,15 @@ function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialT
                    setTools(toolsResponse.data);
                }

-                // Fetch recordings for this workflow
+                // Fetch recordings for this workflow filtered by active TTS config
                try {
                    const recordingsResponse = await listRecordingsApiV1WorkflowRecordingsGet({
-                        query: { workflow_id: workflowId },
+                        query: {
+                            workflow_id: workflowId,
+                            tts_provider: ttsProvider || undefined,
+                            tts_model: ttsModel || undefined,
+                            tts_voice_id: ttsVoiceId || undefined,
+                        },
                    });
                    if (recordingsResponse.data) {
                        setRecordings(recordingsResponse.data.recordings);
@ -142,7 +153,7 @@ function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialT
        };

        fetchData();
-    }, [workflowId]);
+    }, [workflowId, ttsProvider, ttsModel, ttsVoiceId]);

    // Memoize defaultEdgeOptions to prevent unnecessary re-renders
    const defaultEdgeOptions = useMemo(() => ({
--- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
@ -1,11 +1,13 @@
-import { Loader2, Trash2Icon, Upload } from "lucide-react";
+import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";

 import {
    createRecordingApiV1WorkflowRecordingsPost,
    deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
+    getSignedUrlApiV1S3SignedUrlGet,
    getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
    listRecordingsApiV1WorkflowRecordingsGet,
+    transcribeAudioApiV1WorkflowRecordingsTranscribePost,
 } from "@/client";
 import type { RecordingResponseSchema } from "@/client/types.gen";
 import { Button } from "@/components/ui/button";
@ -18,6 +20,15 @@ import {
 } from "@/components/ui/dialog";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
+import {
+    Select,
+    SelectContent,
+    SelectItem,
+    SelectTrigger,
+    SelectValue,
+} from "@/components/ui/select";
+import { Textarea } from "@/components/ui/textarea";
+import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
 import { useUserConfig } from "@/context/UserConfigContext";

 interface RecordingsDialogProps {
@ -29,6 +40,8 @@ interface RecordingsDialogProps {

 const MAX_FILE_SIZE = 5 * 1024 * 1024; // 5MB

+type RecordingStep = "idle" | "naming" | "recording" | "transcribing";
+
 export const RecordingsDialog = ({
    open,
    onOpenChange,
@ -42,7 +55,18 @@ export const RecordingsDialog = ({
    const [transcript, setTranscript] = useState("");
    const [selectedFile, setSelectedFile] = useState<File | null>(null);
    const [error, setError] = useState<string | null>(null);
+    const [language, setLanguage] = useState("multi");
+    const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
+    const [recordingFilename, setRecordingFilename] = useState("");
+    const [recordingDuration, setRecordingDuration] = useState(0);
+    const [playingId, setPlayingId] = useState<string | null>(null);
+    const audioRef = useRef<HTMLAudioElement | null>(null);
+    const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+    const audioChunksRef = useRef<Blob[]>([]);
+    const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
    const fileInputRef = useRef<HTMLInputElement>(null);
+    const languageRef = useRef(language);
+    languageRef.current = language;

    const ttsProvider = (userConfig?.tts?.provider as string) ?? "";
    const ttsModel = (userConfig?.tts?.model as string) ?? "";
@ -70,14 +94,128 @@ export const RecordingsDialog = ({
        }
    }, [workflowId, ttsProvider, ttsModel, ttsVoiceId, onRecordingsChange]);

+    // Cancel the one-second interval that drives the recording duration
+    // counter, if one is currently scheduled.
+    const stopRecordingTimer = useCallback(() => {
+        const timer = recordingTimerRef.current;
+        if (!timer) return;
+        clearInterval(timer);
+        recordingTimerRef.current = null;
+    }, []);
+
+    // Stop the active MediaRecorder, if any. Stopping is what triggers the
+    // recorder's `onstop` handler; an already-inactive recorder is left alone.
+    const stopRecording = useCallback(() => {
+        const recorder = mediaRecorderRef.current;
+        if (!recorder || recorder.state === "inactive") return;
+        recorder.stop();
+    }, []);
+
+    // Return the in-browser recording flow to its initial state: step "idle",
+    // an empty recording name, and a zeroed duration counter.
+    const resetRecordingState = useCallback(() => {
+        setRecordingStep("idle");
+        setRecordingFilename("");
+        setRecordingDuration(0);
+    }, []);
+
+    // Pause whatever recording is being previewed, drop the reference to its
+    // audio element, and clear the "currently playing" marker.
+    const stopPlayback = useCallback(() => {
+        audioRef.current?.pause();
+        audioRef.current = null;
+        setPlayingId(null);
+    }, []);
+
    useEffect(() => {
        if (open) {
            fetchRecordings();
            setError(null);
            setTranscript("");
            setSelectedFile(null);
+            setLanguage("multi");
+            resetRecordingState();
        }
-    }, [open, fetchRecordings]);
+    }, [open, fetchRecordings, resetRecordingState]);
+
+    // When the dialog is closed, halt any in-progress recording, its duration
+    // timer, and any preview playback.
+    useEffect(() => {
+        if (open) return;
+        stopRecording();
+        stopRecordingTimer();
+        stopPlayback();
+    }, [open, stopRecording, stopRecordingTimer, stopPlayback]);
+
+    // Send an audio file to the transcription endpoint and pre-fill the
+    // transcript field with the result. Failure is non-fatal: the user is told
+    // to type the transcript manually. The step always returns to "idle".
+    const transcribeFile = async (file: File) => {
+        const failureMessage = "Auto-transcription failed. You can type the transcript manually.";
+        setRecordingStep("transcribing");
+        try {
+            // Read the language from the ref so the latest selection is used
+            // even when this is invoked from a callback created earlier.
+            const result = await transcribeAudioApiV1WorkflowRecordingsTranscribePost({
+                body: { file, language: languageRef.current },
+            });
+            const data = result.data as Record<string, unknown> | undefined;
+            if (!data) {
+                // A response without a payload is treated as a failure so the
+                // user gets feedback instead of a silently empty transcript.
+                setError(failureMessage);
+                return;
+            }
+            if (typeof data.transcript === "string" && data.transcript) {
+                setTranscript(data.transcript);
+            }
+        } catch {
+            // Transcription failed — user can still type manually
+            setError(failureMessage);
+        } finally {
+            setRecordingStep("idle");
+        }
+    };
+
+    // Ask for microphone access and start capturing audio with MediaRecorder.
+    // When the recorder stops, the captured chunks are assembled into a File,
+    // checked against MAX_FILE_SIZE, selected for upload, and sent for
+    // auto-transcription.
+    const startRecording = async () => {
+        // Kept outside the try block so the catch handler can release the
+        // microphone if setup fails after access was granted.
+        let stream: MediaStream | null = null;
+        try {
+            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            const activeStream = stream;
+            const mediaRecorder = new MediaRecorder(activeStream);
+            mediaRecorderRef.current = mediaRecorder;
+            audioChunksRef.current = [];
+
+            mediaRecorder.ondataavailable = (e) => {
+                if (e.data.size > 0) audioChunksRef.current.push(e.data);
+            };
+
+            // Capture the name now; the state may be reset before onstop runs.
+            const filename = recordingFilename.trim() || "recording";
+            mediaRecorder.onstop = () => {
+                activeStream.getTracks().forEach((t) => t.stop());
+                stopRecordingTimer();
+
+                const mimeType = mediaRecorder.mimeType;
+                const blob = new Blob(audioChunksRef.current, { type: mimeType });
+                if (blob.size > MAX_FILE_SIZE) {
+                    setError(`Recording (${(blob.size / (1024 * 1024)).toFixed(1)}MB) exceeds the maximum allowed size of 5MB.`);
+                    resetRecordingState();
+                    return;
+                }
+                // Pick an extension that matches the container the browser
+                // actually produced; some browsers record to Ogg rather than
+                // WebM or MP4.
+                let ext = "mp4";
+                if (mimeType.includes("webm")) {
+                    ext = "webm";
+                } else if (mimeType.includes("ogg")) {
+                    ext = "ogg";
+                }
+                const file = new File([blob], `${filename}.${ext}`, { type: mimeType });
+                setSelectedFile(file);
+                setError(null);
+                transcribeFile(file);
+            };
+
+            mediaRecorder.start();
+            setRecordingStep("recording");
+            setRecordingDuration(0);
+            setError(null);
+            recordingTimerRef.current = setInterval(() => {
+                setRecordingDuration((d) => d + 1);
+            }, 1000);
+        } catch {
+            if (stream) {
+                // Access was granted but the recorder could not be created or
+                // started: release the microphone so it does not stay live.
+                stream.getTracks().forEach((t) => t.stop());
+                setError("Could not start recording. Please try again.");
+            } else {
+                setError("Microphone access denied. Please allow microphone permissions.");
+            }
+            resetRecordingState();
+        }
+    };
+
+    // Click handler for the "Stop" button; delegates to stopRecording().
+    const handleStopRecording = () => {
+        stopRecording();
+    };
+
+    // Validate a file chosen through the hidden file input. Oversized files are
+    // rejected with an error and the input is cleared; accepted files are
+    // selected for upload and sent for auto-transcription.
+    const handleFileSelect = (file: File | null) => {
+        if (!file) {
+            // Selection was cleared: just reset the related state.
+            setError(null);
+            setSelectedFile(null);
+            return;
+        }
+
+        if (file.size > MAX_FILE_SIZE) {
+            const sizeMb = (file.size / (1024 * 1024)).toFixed(1);
+            setError(`File size (${sizeMb}MB) exceeds the maximum allowed size of 5MB.`);
+            setSelectedFile(null);
+            if (fileInputRef.current) fileInputRef.current.value = "";
+            return;
+        }
+
+        setError(null);
+        setSelectedFile(file);
+        transcribeFile(file);
+    };

    const handleUpload = async () => {
        if (!selectedFile || !transcript.trim()) return;
@ -137,6 +275,7 @@ export const RecordingsDialog = ({
                        original_filename: selectedFile.name,
                        file_size_bytes: selectedFile.size,
                        mime_type: selectedFile.type,
+                        language,
                    },
                },
            });
@ -144,6 +283,8 @@ export const RecordingsDialog = ({
            // Reset form and refresh list
            setTranscript("");
            setSelectedFile(null);
+            setLanguage("multi");
+            resetRecordingState();
            if (fileInputRef.current) fileInputRef.current.value = "";
            await fetchRecordings();
        } catch (err) {
@ -166,13 +307,44 @@ export const RecordingsDialog = ({
        }
    };

+    // Toggle preview playback of a stored recording. Clicking the recording
+    // that is already playing stops it; clicking another one fetches a signed
+    // URL for its storage key and plays it.
+    const handlePlay = async (rec: RecordingResponseSchema) => {
+        const wasPlaying = playingId === rec.recording_id;
+        stopPlayback();
+        if (wasPlaying) return;
+
+        // Declared outside the try block so the catch handler can tell whether
+        // the failure happened before or after the audio element was created.
+        let audio: HTMLAudioElement | null = null;
+        try {
+            const result = await getSignedUrlApiV1S3SignedUrlGet({
+                query: {
+                    key: rec.storage_key,
+                    storage_backend: rec.storage_backend,
+                },
+            });
+            if (!result.data?.url) {
+                setError("Failed to get audio URL");
+                return;
+            }
+            // Another recording may have started while the URL was being
+            // fetched; stop it so two clips never play at once.
+            stopPlayback();
+            audio = new Audio(result.data.url);
+            audio.onended = () => setPlayingId(null);
+            audioRef.current = audio;
+            setPlayingId(rec.recording_id);
+            await audio.play();
+        } catch {
+            if (audio && audioRef.current !== audio) {
+                // Playback was stopped or replaced while it was starting;
+                // that is a deliberate interruption, not a failure.
+                return;
+            }
+            if (audio) {
+                // play() was rejected: clear the playing marker so the row
+                // does not stay stuck showing the pause icon.
+                stopPlayback();
+            }
+            setError("Failed to play recording");
+        }
+    };
+
+    const isRecording = recordingStep === "recording";
+    const isTranscribing = recordingStep === "transcribing";
+    const isBusy = uploading || isRecording || isTranscribing;
+
    return (
        <Dialog open={open} onOpenChange={onOpenChange}>
            <DialogContent className="max-w-lg max-h-[80vh] overflow-y-auto">
                <DialogHeader>
                    <DialogTitle>Workflow Recordings</DialogTitle>
                    <DialogDescription>
-                        Upload audio recordings for hybrid prompts. Recordings are
+                        Upload or record audio for hybrid prompts. Recordings are
                        scoped to your current TTS configuration. Use{" "}
                        <code className="text-xs bg-muted px-1 rounded">@</code> in
                        prompt fields to insert them.
@ -211,48 +383,158 @@ export const RecordingsDialog = ({

                {/* Upload Section */}
                <div className="space-y-3 border rounded-md p-3">
-                    <Label className="text-sm font-medium">Upload New Recording</Label>
+                    <Label className="text-sm font-medium">Add New Recording</Label>
+
+                    {/* Audio source: file picker or record */}
                    <div>
                        <Label className="text-xs text-muted-foreground">
                            Audio File
                        </Label>
-                        <Input
-                            ref={fileInputRef}
-                            type="file"
-                            accept="audio/*"
-                            onChange={(e) => {
-                                const file = e.target.files?.[0] ?? null;
-                                if (file && file.size > MAX_FILE_SIZE) {
-                                    setError(
-                                        `File size (${(file.size / (1024 * 1024)).toFixed(1)}MB) exceeds the maximum allowed size of 5MB.`
-                                    );
-                                    setSelectedFile(null);
-                                    if (fileInputRef.current) fileInputRef.current.value = "";
-                                    return;
-                                }
-                                setError(null);
-                                setSelectedFile(file);
-                            }}
-                            className="text-sm"
-                        />
-                        <p className="text-xs text-muted-foreground mt-1">
-                            Max 5MB
-                        </p>
+                        <div className="flex gap-2">
+                            <input
+                                ref={fileInputRef}
+                                type="file"
+                                accept="audio/*"
+                                onChange={(e) => handleFileSelect(e.target.files?.[0] ?? null)}
+                                className="hidden"
+                            />
+                            <Button
+                                type="button"
+                                variant="outline"
+                                size="sm"
+                                className="flex-1 justify-start text-sm font-normal"
+                                onClick={() => fileInputRef.current?.click()}
+                                disabled={isBusy}
+                            >
+                                <Upload className="w-4 h-4 mr-2 shrink-0" />
+                                {selectedFile && recordingStep !== "naming" ? (
+                                    <span className="truncate">
+                                        {selectedFile.name} ({(selectedFile.size / (1024 * 1024)).toFixed(1)}MB)
+                                    </span>
+                                ) : (
+                                    <span className="text-muted-foreground">Choose audio file (max 5MB)</span>
+                                )}
+                            </Button>
+                            {recordingStep === "idle" && (
+                                <Button
+                                    type="button"
+                                    variant="outline"
+                                    size="sm"
+                                    onClick={() => setRecordingStep("naming")}
+                                    disabled={uploading || isTranscribing}
+                                >
+                                    <Mic className="w-4 h-4 mr-1" />
+                                    Record
+                                </Button>
+                            )}
+                        </div>
                    </div>
+
+                    {/* Recording: filename + start/stop */}
+                    {(recordingStep === "naming" || isRecording) && (
+                        <div className="space-y-2 rounded-md border border-dashed p-3 bg-muted/20">
+                            {recordingStep === "naming" && (
+                                <>
+                                    <div>
+                                        <Label className="text-xs text-muted-foreground">
+                                            Recording Name
+                                        </Label>
+                                        <Input
+                                            placeholder="e.g. greeting, hold-message"
+                                            value={recordingFilename}
+                                            onChange={(e) => setRecordingFilename(e.target.value)}
+                                            autoFocus
+                                        />
+                                    </div>
+                                    <div className="flex gap-2">
+                                        <Button
+                                            size="sm"
+                                            onClick={startRecording}
+                                            disabled={!recordingFilename.trim()}
+                                        >
+                                            <Mic className="w-4 h-4 mr-1" />
+                                            Start Recording
+                                        </Button>
+                                        <Button
+                                            size="sm"
+                                            variant="ghost"
+                                            onClick={resetRecordingState}
+                                        >
+                                            Cancel
+                                        </Button>
+                                    </div>
+                                </>
+                            )}
+                            {isRecording && (
+                                <div className="flex items-center gap-3">
+                                    <span className="relative flex h-3 w-3">
+                                        <span className="animate-ping absolute inline-flex h-full w-full rounded-full bg-red-400 opacity-75" />
+                                        <span className="relative inline-flex rounded-full h-3 w-3 bg-red-500" />
+                                    </span>
+                                    <span className="text-sm font-mono">
+                                        {Math.floor(recordingDuration / 60)}:{(recordingDuration % 60).toString().padStart(2, "0")}
+                                    </span>
+                                    <span className="text-xs text-muted-foreground">{recordingFilename}</span>
+                                    <Button
+                                        size="sm"
+                                        variant="destructive"
+                                        onClick={handleStopRecording}
+                                        className="ml-auto"
+                                    >
+                                        <Square className="w-4 h-4 mr-1" />
+                                        Stop
+                                    </Button>
+                                </div>
+                            )}
+                        </div>
+                    )}
+
+                    {/* Transcribing progress */}
+                    {isTranscribing && (
+                        <div className="flex items-center gap-2 text-sm text-muted-foreground">
+                            <Loader2 className="w-4 h-4 animate-spin" />
+                            Transcribing audio...
+                        </div>
+                    )}
+
+                    {/* Language */}
+                    <div>
+                        <Label className="text-xs text-muted-foreground">
+                            Language
+                        </Label>
+                        <Select value={language} onValueChange={setLanguage}>
+                            <SelectTrigger className="h-9 text-sm">
+                                <SelectValue />
+                            </SelectTrigger>
+                            <SelectContent>
+                                {Object.entries(LANGUAGE_DISPLAY_NAMES).map(([code, name]) => (
+                                    <SelectItem key={code} value={code}>
+                                        {name}
+                                    </SelectItem>
+                                ))}
+                            </SelectContent>
+                        </Select>
+                    </div>
+
+                    {/* Transcript */}
                    <div>
                        <Label className="text-xs text-muted-foreground">
                            Transcript
                        </Label>
-                        <Input
-                            placeholder="What does this recording say?"
+                        <Textarea
+                            placeholder={isTranscribing ? "Transcribing..." : "What does this recording say?"}
                            value={transcript}
                            onChange={(e) => setTranscript(e.target.value)}
+                            disabled={isTranscribing}
+                            rows={3}
+                            className="resize-none text-sm"
                        />
                    </div>
+
                    <Button
                        size="sm"
                        onClick={handleUpload}
-                        disabled={!selectedFile || !transcript.trim() || uploading}
+                        disabled={!selectedFile || !transcript.trim() || isBusy}
                    >
                        {uploading ? (
                            <Loader2 className="w-4 h-4 mr-1 animate-spin" />
@ -289,14 +571,25 @@ export const RecordingsDialog = ({
                            >
                                <div className="flex-1 min-w-0">
                                    <div className="flex items-center gap-2">
-                                        <code className="text-xs bg-muted px-1.5 py-0.5 rounded font-mono">
-                                            {rec.recording_id}
+                                        <code className="text-xs bg-muted px-1.5 py-0.5 rounded font-mono truncate max-w-[300px]">
+                                            {(rec.metadata?.original_filename as string) || rec.recording_id}
                                        </code>
                                    </div>
                                    <p className="text-sm text-muted-foreground mt-1 break-all line-clamp-2">
                                        {rec.transcript}
                                    </p>
                                </div>
+                                <Button
+                                    size="sm"
+                                    variant="ghost"
+                                    onClick={() => handlePlay(rec)}
+                                >
+                                    {playingId === rec.recording_id ? (
+                                        <Pause className="w-4 h-4" />
+                                    ) : (
+                                        <Play className="w-4 h-4" />
+                                    )}
+                                </Button>
                                <Button
                                    size="sm"
                                    variant="ghost"
--- a/ui/src/app/workflow/[workflowId]/hooks/useWorkflowState.ts
+++ b/ui/src/app/workflow/[workflowId]/hooks/useWorkflowState.ts
@ -363,7 +363,13 @@ export const useWorkflowState = ({
    // Save workflow function
    const saveWorkflow = useCallback(async (updateWorkflowDefinition: boolean = true) => {
        if (!user || !rfInstance.current) return;
-        const flow = rfInstance.current.toObject();
+        // Read nodes/edges from the Zustand store (synchronously up-to-date)
+        // and viewport from the ReactFlow instance to build the flow object.
+        // This avoids a race condition where rfInstance.toObject() may return
+        // stale node data if React hasn't re-rendered yet after a store update.
+        const { nodes: currentNodes, edges: currentEdges } = useWorkflowStore.getState();
+        const viewport = rfInstance.current.getViewport();
+        const flow = { nodes: currentNodes, edges: currentEdges, viewport };
        try {
            await updateWorkflowApiV1WorkflowWorkflowIdPut({
                path: {
--- a/ui/src/app/workflow/[workflowId]/utils/layoutNodes.ts
+++ b/ui/src/app/workflow/[workflowId]/utils/layoutNodes.ts
@ -18,14 +18,12 @@ export const layoutNodes = (
    // Separate nodes by type
    const triggerNodes = nodes.filter(n => n.type === NodeType.TRIGGER);
    const webhookNodes = nodes.filter(n => n.type === NodeType.WEBHOOK);
-    const globalNodes = nodes.filter(n => n.type === NodeType.GLOBAL_NODE || n.type === 'global');
+    const qaNodes = nodes.filter(n => n.type === NodeType.QA);
+    const globalNodes = nodes.filter(n => n.type === NodeType.GLOBAL_NODE);
    const workflowNodes = nodes.filter(n =>
        n.type === NodeType.START_CALL ||
        n.type === NodeType.AGENT_NODE ||
-        n.type === NodeType.END_CALL ||
-        n.type === 'startCall' ||
-        n.type === 'agentNode' ||
-        n.type === 'endCall'
+        n.type === NodeType.END_CALL
    );

    // If no workflow nodes, just return original nodes
@ -161,12 +159,26 @@ export const layoutNodes = (
        };
    });

+    // Position QA nodes below webhook nodes on the right side
+    const qaStartY = webhookNodes.length > 0
+        ? workflowCenterY - (webhookNodes.length * NODE_HEIGHT + (webhookNodes.length - 1) * VERTICAL_SPACING) / 2
+            + webhookNodes.length * (NODE_HEIGHT + VERTICAL_SPACING) + VERTICAL_SPACING
+        : workflowCenterY;
+    const positionedQaNodes = qaNodes.map((node, index) => ({
+        ...node,
+        position: {
+            x: webhookNodesX,
+            y: qaStartY + index * (NODE_HEIGHT + VERTICAL_SPACING)
+        }
+    }));
+
    // Combine all positioned nodes
    const allPositionedNodes = [
        ...positionedTriggerNodes,
        ...positionedGlobalNodes,
        ...positionedWorkflowNodes,
-        ...positionedWebhookNodes
+        ...positionedWebhookNodes,
+        ...positionedQaNodes
    ];

    // Create a map for quick lookup
--- a/ui/src/client/sdk.gen.ts
+++ b/ui/src/client/sdk.gen.ts
--- a/ui/src/client/types.gen.ts
+++ b/ui/src/client/types.gen.ts
@ -80,6 +80,11 @@ export type AuthUserResponse = {
    is_superuser: boolean;
 };

+export type BodyTranscribeAudioApiV1WorkflowRecordingsTranscribePost = {
+    file: Blob | File;
+    language?: string;
+};
+
 export type CallDispositionCodes = {
    disposition_codes?: Array<string>;
 };
@ -4019,6 +4024,10 @@ export type GetSignedUrlApiV1S3SignedUrlGetData = {
        key: string;
        expires_in?: number;
        inline?: boolean;
+        /**
+         * Storage backend to use (e.g. 'minio', 's3'). When omitted the backend is inferred from the resource.
+         */
+        storage_backend?: string | null;
    };
    url: '/api/v1/s3/signed-url';
 };
@ -5568,6 +5577,37 @@ export type DeleteRecordingApiV1WorkflowRecordingsRecordingIdDeleteResponses = {
    200: unknown;
 };

+export type TranscribeAudioApiV1WorkflowRecordingsTranscribePostData = {
+    body: BodyTranscribeAudioApiV1WorkflowRecordingsTranscribePost;
+    headers?: {
+        authorization?: string | null;
+        'X-API-Key'?: string | null;
+    };
+    path?: never;
+    query?: never;
+    url: '/api/v1/workflow-recordings/transcribe';
+};
+
+export type TranscribeAudioApiV1WorkflowRecordingsTranscribePostErrors = {
+    /**
+     * Not found
+     */
+    404: unknown;
+    /**
+     * Validation Error
+     */
+    422: HttpValidationError;
+};
+
+export type TranscribeAudioApiV1WorkflowRecordingsTranscribePostError = TranscribeAudioApiV1WorkflowRecordingsTranscribePostErrors[keyof TranscribeAudioApiV1WorkflowRecordingsTranscribePostErrors];
+
+export type TranscribeAudioApiV1WorkflowRecordingsTranscribePostResponses = {
+    /**
+     * Successful Response
+     */
+    200: unknown;
+};
+
 export type SignupApiV1AuthSignupPostData = {
    body: SignupRequest;
    path?: never;
--- a/ui/src/components/ServiceConfiguration.tsx
+++ b/ui/src/components/ServiceConfiguration.tsx
@ -13,6 +13,7 @@ import { Label } from "@/components/ui/label";
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
 import { VoiceSelector } from "@/components/VoiceSelector";
+import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
 import { useUserConfig } from "@/context/UserConfigContext";

 type ServiceSegment = "llm" | "tts" | "stt" | "embeddings";
@ -46,105 +47,6 @@ const TAB_CONFIG: { key: ServiceSegment; label: string }[] = [
    { key: "embeddings", label: "Embedding" },
 ];

-// Display names for language codes (Deepgram + Sarvam)
-const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {
-    "multi": "Multilingual (Auto-detect)",
-    // Arabic
-    "ar": "Arabic",
-    "ar-AE": "Arabic (UAE)",
-    "ar-SA": "Arabic (Saudi Arabia)",
-    "ar-QA": "Arabic (Qatar)",
-    "ar-KW": "Arabic (Kuwait)",
-    "ar-SY": "Arabic (Syria)",
-    "ar-LB": "Arabic (Lebanon)",
-    "ar-PS": "Arabic (Palestine)",
-    "ar-JO": "Arabic (Jordan)",
-    "ar-EG": "Arabic (Egypt)",
-    "ar-SD": "Arabic (Sudan)",
-    "ar-TD": "Arabic (Chad)",
-    "ar-MA": "Arabic (Morocco)",
-    "ar-DZ": "Arabic (Algeria)",
-    "ar-TN": "Arabic (Tunisia)",
-    "ar-IQ": "Arabic (Iraq)",
-    "ar-IR": "Arabic (Iran)",
-    // Other languages
-    "be": "Belarusian",
-    "bn": "Bengali",
-    "bs": "Bosnian",
-    "bg": "Bulgarian",
-    "ca": "Catalan",
-    "cs": "Czech",
-    "da": "Danish",
-    "da-DK": "Danish (Denmark)",
-    "de": "German",
-    "de-CH": "German (Switzerland)",
-    "el": "Greek",
-    "en": "English",
-    "en-US": "English (US)",
-    "en-AU": "English (Australia)",
-    "en-GB": "English (UK)",
-    "en-IN": "English (India)",
-    "en-NZ": "English (New Zealand)",
-    "es": "Spanish",
-    "es-419": "Spanish (Latin America)",
-    "et": "Estonian",
-    "fa": "Persian",
-    "fi": "Finnish",
-    "fr": "French",
-    "fr-CA": "French (Canada)",
-    "he": "Hebrew",
-    "hi": "Hindi",
-    "hr": "Croatian",
-    "hu": "Hungarian",
-    "id": "Indonesian",
-    "it": "Italian",
-    "ja": "Japanese",
-    "kn": "Kannada",
-    "ko": "Korean",
-    "ko-KR": "Korean (South Korea)",
-    "lt": "Lithuanian",
-    "lv": "Latvian",
-    "mk": "Macedonian",
-    "mr": "Marathi",
-    "ms": "Malay",
-    "nl": "Dutch",
-    "nl-BE": "Flemish",
-    "no": "Norwegian",
-    "pl": "Polish",
-    "pt": "Portuguese",
-    "pt-BR": "Portuguese (Brazil)",
-    "pt-PT": "Portuguese (Portugal)",
-    "ro": "Romanian",
-    "ru": "Russian",
-    "sk": "Slovak",
-    "sl": "Slovenian",
-    "sr": "Serbian",
-    "sv": "Swedish",
-    "sv-SE": "Swedish (Sweden)",
-    "ta": "Tamil",
-    "te": "Telugu",
-    "th": "Thai",
-    "tl": "Tagalog",
-    "tr": "Turkish",
-    "uk": "Ukrainian",
-    "ur": "Urdu",
-    "vi": "Vietnamese",
-    "zh-CN": "Chinese (Simplified)",
-    "zh-TW": "Chinese (Traditional)",
-    // Sarvam Indian languages
-    "bn-IN": "Bengali",
-    "gu-IN": "Gujarati",
-    "hi-IN": "Hindi",
-    "kn-IN": "Kannada",
-    "ml-IN": "Malayalam",
-    "mr-IN": "Marathi",
-    "od-IN": "Odia",
-    "pa-IN": "Punjabi",
-    "ta-IN": "Tamil",
-    "te-IN": "Telugu",
-    "as-IN": "Assamese",
-};
-
 // Display names for Sarvam voices
 const VOICE_DISPLAY_NAMES: Record<string, string> = {
    "anushka": "Anushka (Female)",
@ -236,11 +138,21 @@ export default function ServiceConfiguration() {
                        }
                    });
                    selectedProviders[service] = userConfig?.[service]?.provider as string;
+                    // Fill in schema defaults for fields not present in userConfig
+                    const properties = response.data[service]?.[selectedProviders[service]]?.properties as Record<string, SchemaProperty>;
+                    if (properties) {
+                        Object.entries(properties).forEach(([field, schema]) => {
+                            const key = `${service}_${field}`;
+                            if (field !== "provider" && field !== "api_key" && schema.default !== undefined && !(key in defaultValues)) {
+                                defaultValues[key] = schema.default;
+                            }
+                        });
+                    }
                } else {
                    const properties = response.data[service]?.[selectedProviders[service]]?.properties as Record<string, SchemaProperty>;
                    if (properties) {
                        Object.entries(properties).forEach(([field, schema]) => {
-                            if (field !== "provider" && schema.default) {
+                            if (field !== "provider" && schema.default !== undefined) {
                                defaultValues[`${service}_${field}`] = schema.default;
                            }
                        });
--- a/ui/src/components/flow/MentionTextarea.tsx
+++ b/ui/src/components/flow/MentionTextarea.tsx
@ -15,6 +15,7 @@ export interface MentionItem {
    id: string;
    name: string;
    description: string;
+    filename: string;
 }

 interface MentionTextareaProps {
@ -46,6 +47,7 @@ export function MentionTextarea({
                id: r.recording_id,
                name: r.transcript,
                description: r.transcript,
+                filename: (r.metadata?.original_filename as string) || r.recording_id,
            })),
        [recordings]
    );
@ -195,7 +197,7 @@ export function MentionTextarea({
                        >
                            <div className="flex items-center gap-2">
                                <code className="text-xs bg-muted px-1 py-0.5 rounded font-mono">
-                                    {item.id}
+                                    {item.filename}
                                </code>
                                <span className="font-medium truncate">{item.name}</span>
                            </div>
--- a/ui/src/components/flow/edges/CustomEdge.tsx
+++ b/ui/src/components/flow/edges/CustomEdge.tsx
@ -215,11 +215,7 @@ export default function CustomEdge(props: CustomEdgeProps) {
    const handleSaveEdgeData = useCallback(async (updatedData: FlowEdgeData) => {
        // Use the workflow store's updateEdge method to properly track history
        updateEdge(id, { data: updatedData });
-
-        // Save the workflow after updating edge data with a small delay to ensure state is updated
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    }, [id, updateEdge, saveWorkflow]);

    return (
--- a/ui/src/components/flow/nodes/AgentNode.tsx
+++ b/ui/src/components/flow/nodes/AgentNode.tsx
@ -89,10 +89,7 @@ export const AgentNode = memo(({ data, selected, id }: AgentNodeProps) => {
            document_uuids: documentUuids.length > 0 ? documentUuids : undefined,
        });
        setOpen(false);
-        // Save the workflow after updating node data with a small delay to ensure state is updated
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    // Reset form state when dialog opens
@ -127,27 +124,23 @@ export const AgentNode = memo(({ data, selected, id }: AgentNodeProps) => {
    }, [data, open]);

    // Handle cleanup of stale document UUIDs
-    const handleStaleDocuments = useCallback((staleUuids: string[]) => {
+    const handleStaleDocuments = useCallback(async (staleUuids: string[]) => {
        const cleanedUuids = (data.document_uuids ?? []).filter(uuid => !staleUuids.includes(uuid));
        handleSaveNodeData({
            ...data,
            document_uuids: cleanedUuids.length > 0 ? cleanedUuids : undefined,
        });
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    }, [data, handleSaveNodeData, saveWorkflow]);

    // Handle cleanup of stale tool UUIDs
-    const handleStaleTools = useCallback((staleUuids: string[]) => {
+    const handleStaleTools = useCallback(async (staleUuids: string[]) => {
        const cleanedUuids = (data.tool_uuids ?? []).filter(uuid => !staleUuids.includes(uuid));
        handleSaveNodeData({
            ...data,
            tool_uuids: cleanedUuids.length > 0 ? cleanedUuids : undefined,
        });
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    }, [data, handleSaveNodeData, saveWorkflow]);

    return (
--- a/ui/src/components/flow/nodes/EndCall.tsx
+++ b/ui/src/components/flow/nodes/EndCall.tsx
@ -75,10 +75,7 @@ export const EndCall = memo(({ data, selected, id }: EndCallNodeProps) => {
            add_global_prompt: addGlobalPrompt,
        });
        setOpen(false);
-        // Save the workflow after updating node data with a small delay to ensure state is updated
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    // Reset form state when dialog opens
--- a/ui/src/components/flow/nodes/GlobalNode.tsx
+++ b/ui/src/components/flow/nodes/GlobalNode.tsx
@ -52,10 +52,7 @@ export const GlobalNode = memo(({ data, selected, id }: GlobalNodeProps) => {
            name
        });
        setOpen(false);
-        // Save the workflow after updating node data with a small delay to ensure state is updated
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    // Reset form state when dialog opens
--- a/ui/src/components/flow/nodes/QANode.tsx
+++ b/ui/src/components/flow/nodes/QANode.tsx
@ -66,9 +66,7 @@ export const QANode = memo(({ data, selected, id }: QANodeProps) => {
            qa_sample_rate: qaSampleRate,
        });
        setOpen(false);
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    const resetFormState = () => {
--- a/ui/src/components/flow/nodes/StartCall.tsx
+++ b/ui/src/components/flow/nodes/StartCall.tsx
@ -104,10 +104,7 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
            document_uuids: documentUuids.length > 0 ? documentUuids : undefined,
        });
        setOpen(false);
-        // Save the workflow after updating node data with a small delay to ensure state is updated
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    // Reset form state when dialog opens
@ -148,27 +145,23 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
    }, [data, open]);

    // Handle cleanup of stale document UUIDs
-    const handleStaleDocuments = useCallback((staleUuids: string[]) => {
+    const handleStaleDocuments = useCallback(async (staleUuids: string[]) => {
        const cleanedUuids = (data.document_uuids ?? []).filter(uuid => !staleUuids.includes(uuid));
        handleSaveNodeData({
            ...data,
            document_uuids: cleanedUuids.length > 0 ? cleanedUuids : undefined,
        });
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    }, [data, handleSaveNodeData, saveWorkflow]);

    // Handle cleanup of stale tool UUIDs
-    const handleStaleTools = useCallback((staleUuids: string[]) => {
+    const handleStaleTools = useCallback(async (staleUuids: string[]) => {
        const cleanedUuids = (data.tool_uuids ?? []).filter(uuid => !staleUuids.includes(uuid));
        handleSaveNodeData({
            ...data,
            tool_uuids: cleanedUuids.length > 0 ? cleanedUuids : undefined,
        });
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    }, [data, handleSaveNodeData, saveWorkflow]);

    return (
--- a/ui/src/components/flow/nodes/TriggerNode.tsx
+++ b/ui/src/components/flow/nodes/TriggerNode.tsx
@ -61,10 +61,7 @@ export const TriggerNode = memo(({ data, selected, id }: TriggerNodeProps) => {
            trigger_path: triggerPath,
        });
        setOpen(false);
-        // Save the workflow after updating node data
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    // Reset form state when dialog opens
--- a/ui/src/components/flow/nodes/WebhookNode.tsx
+++ b/ui/src/components/flow/nodes/WebhookNode.tsx
@ -86,9 +86,7 @@ export const WebhookNode = memo(({ data, selected, id }: WebhookNodeProps) => {
            payload_template: validation.parsed as Record<string, unknown>,
        });
        setOpen(false);
-        setTimeout(async () => {
-            await saveWorkflow();
-        }, 100);
+        await saveWorkflow();
    };

    const handleOpenChange = (newOpen: boolean) => {
--- a/ui/src/components/flow/nodes/common/NodeEditDialog.tsx
+++ b/ui/src/components/flow/nodes/common/NodeEditDialog.tsx
@ -71,12 +71,13 @@ export const NodeEditDialog = ({
        const handleKeyDown = (e: KeyboardEvent) => {
            if ((e.metaKey || e.ctrlKey) && e.key === 's') {
                e.preventDefault();
+                e.stopImmediatePropagation();
                handleSave();
            }
        };

-        window.addEventListener('keydown', handleKeyDown);
-        return () => window.removeEventListener('keydown', handleKeyDown);
+        window.addEventListener('keydown', handleKeyDown, true);
+        return () => window.removeEventListener('keydown', handleKeyDown, true);
    }, [open, handleSave]);

    return (
--- a/ui/src/components/workflow/CreateWorkflowButton.tsx
+++ b/ui/src/components/workflow/CreateWorkflowButton.tsx
@ -1,22 +1,114 @@
 'use client';

-import { PlusIcon } from 'lucide-react';
+import { Bot, ChevronDown, LayoutTemplate, PlusIcon } from 'lucide-react';
 import { useRouter } from 'next/navigation';
+import { useState } from 'react';
+import { toast } from 'sonner';

+import { createWorkflowApiV1WorkflowCreateDefinitionPost } from '@/client/sdk.gen';
 import { Button } from "@/components/ui/button";
+import {
+    DropdownMenu,
+    DropdownMenuContent,
+    DropdownMenuItem,
+    DropdownMenuTrigger,
+} from "@/components/ui/dropdown-menu";
+import { useAuth } from '@/lib/auth';
+import logger from '@/lib/logger';
+import { getRandomId } from '@/lib/utils';
+
+const BLANK_WORKFLOW_DEFINITION = {
+    nodes: [
+        {
+            id: "1",
+            type: "startCall",
+            position: { x: 175, y: 60 },
+            data: {
+                prompt: "# Goal\nYou are a helpful agent who is handing a conversation over voice with a human. This is a voice conversation, so transcripts can be error prone.\n\n## Rules\n- Language: UK English but does not have to be correct english\n- Keep responses short and 2-3 sentences max\n- If you have to repeat something that you said in your previous two turns, then rephrase a bit while keeping the same meaning. Never repeat the exact same words as in your previous 2 responses.\n\n## Speech Handling\n- There could be multiple transcription errors. \n- Accept variations: yes/yeah/yep/aye, no/nah/nope\n- If user says \"sorry?\" or \"pardon me\" or \"can you repeat\"  or \"what?\", they might not have heard you- so just repeat what you just said.\n\n### Flow\nStart by saying \"Hi\". Be polite and courteous. ",
+                name: "start call",
+                allow_interrupt: false,
+                invalid: false,
+                validationMessage: null,
+                is_static: false,
+                add_global_prompt: false,
+                wait_for_user_response: false,
+                detect_voicemail: true,
+                delayed_start: false,
+                is_start: true,
+                selected_through_edge: false,
+                hovered_through_edge: false,
+                extraction_enabled: false,
+                selected: false,
+                dragging: false,
+            },
+        },
+    ],
+    edges: [],
+    viewport: { x: 808, y: 269, zoom: 0.75 },
+};

 export function CreateWorkflowButton() {
    const router = useRouter();
-    const handleClick = () => {
+    const { user, getAccessToken } = useAuth();
+    const [isCreating, setIsCreating] = useState(false);
+
+    const handleAgentBuilder = () => {
        router.push('/workflow/create');
    };

+    const handleBlankCanvas = async () => {
+        if (isCreating || !user) return;
+        setIsCreating(true);
+
+        try {
+            const accessToken = await getAccessToken();
+            const name = `Workflow-${getRandomId()}`;
+            const response = await createWorkflowApiV1WorkflowCreateDefinitionPost({
+                body: {
+                    name,
+                    workflow_definition: BLANK_WORKFLOW_DEFINITION as unknown as { [key: string]: unknown },
+                },
+                headers: {
+                    'Authorization': `Bearer ${accessToken}`,
+                },
+            });
+
+            if (response.data?.id) {
+                router.push(`/workflow/${response.data.id}`);
+            }
+        } catch (err) {
+            logger.error(`Error creating blank workflow: ${err}`);
+            toast.error('Failed to create workflow');
+        } finally {
+            setIsCreating(false);
+        }
+    };
+
    return (
-        <Button
-            onClick={handleClick}
-        >
-            <PlusIcon className="w-4 h-4" />
-            Create Agent
-        </Button>
+        <DropdownMenu>
+            <DropdownMenuTrigger asChild>
+                <Button disabled={isCreating}>
+                    <PlusIcon className="w-4 h-4" />
+                    {isCreating ? 'Creating...' : 'Create Agent'}
+                    <ChevronDown className="w-4 h-4" />
+                </Button>
+            </DropdownMenuTrigger>
+            <DropdownMenuContent align="end">
+                <DropdownMenuItem onClick={handleAgentBuilder} className="cursor-pointer">
+                    <Bot className="w-4 h-4 mr-2" />
+                    <div>
+                        <div className="font-medium">Use Agent Builder</div>
+                        <div className="text-xs text-muted-foreground">AI generates a workflow from your description</div>
+                    </div>
+                </DropdownMenuItem>
+                <DropdownMenuItem onClick={handleBlankCanvas} disabled={isCreating} className="cursor-pointer">
+                    <LayoutTemplate className="w-4 h-4 mr-2" />
+                    <div>
+                        <div className="font-medium">Blank Canvas</div>
+                        <div className="text-xs text-muted-foreground">Start from scratch with an empty workflow</div>
+                    </div>
+                </DropdownMenuItem>
+            </DropdownMenuContent>
+        </DropdownMenu>
    );
 }
--- a/ui/src/constants/languages.ts
+++ b/ui/src/constants/languages.ts
@ -0,0 +1,98 @@
+// Display names for language codes (Deepgram + Sarvam)
+export const LANGUAGE_DISPLAY_NAMES: Record<string, string> = {
+    "multi": "Multilingual (Auto-detect)",
+    // Arabic
+    "ar": "Arabic",
+    "ar-AE": "Arabic (UAE)",
+    "ar-SA": "Arabic (Saudi Arabia)",
+    "ar-QA": "Arabic (Qatar)",
+    "ar-KW": "Arabic (Kuwait)",
+    "ar-SY": "Arabic (Syria)",
+    "ar-LB": "Arabic (Lebanon)",
+    "ar-PS": "Arabic (Palestine)",
+    "ar-JO": "Arabic (Jordan)",
+    "ar-EG": "Arabic (Egypt)",
+    "ar-SD": "Arabic (Sudan)",
+    "ar-TD": "Arabic (Chad)",
+    "ar-MA": "Arabic (Morocco)",
+    "ar-DZ": "Arabic (Algeria)",
+    "ar-TN": "Arabic (Tunisia)",
+    "ar-IQ": "Arabic (Iraq)",
+    "ar-IR": "Arabic (Iran)",
+    // Other languages
+    "be": "Belarusian",
+    "bn": "Bengali",
+    "bs": "Bosnian",
+    "bg": "Bulgarian",
+    "ca": "Catalan",
+    "cs": "Czech",
+    "da": "Danish",
+    "da-DK": "Danish (Denmark)",
+    "de": "German",
+    "de-CH": "German (Switzerland)",
+    "el": "Greek",
+    "en": "English",
+    "en-US": "English (US)",
+    "en-AU": "English (Australia)",
+    "en-GB": "English (UK)",
+    "en-IN": "English (India)",
+    "en-NZ": "English (New Zealand)",
+    "es": "Spanish",
+    "es-419": "Spanish (Latin America)",
+    "et": "Estonian",
+    "fa": "Persian",
+    "fi": "Finnish",
+    "fr": "French",
+    "fr-CA": "French (Canada)",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hr": "Croatian",
+    "hu": "Hungarian",
+    "id": "Indonesian",
+    "it": "Italian",
+    "ja": "Japanese",
+    "kn": "Kannada",
+    "ko": "Korean",
+    "ko-KR": "Korean (South Korea)",
+    "lt": "Lithuanian",
+    "lv": "Latvian",
+    "mk": "Macedonian",
+    "mr": "Marathi",
+    "ms": "Malay",
+    "nl": "Dutch",
+    "nl-BE": "Flemish",
+    "no": "Norwegian",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "pt-BR": "Portuguese (Brazil)",
+    "pt-PT": "Portuguese (Portugal)",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "sr": "Serbian",
+    "sv": "Swedish",
+    "sv-SE": "Swedish (Sweden)",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "th": "Thai",
+    "tl": "Tagalog",
+    "tr": "Turkish",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "vi": "Vietnamese",
+    "zh-CN": "Chinese (Simplified)",
+    "zh-TW": "Chinese (Traditional)",
+    // Sarvam Indian languages
+    "bn-IN": "Bengali",
+    "gu-IN": "Gujarati",
+    "hi-IN": "Hindi",
+    "kn-IN": "Kannada",
+    "ml-IN": "Malayalam",
+    "mr-IN": "Marathi",
+    "od-IN": "Odia",
+    "pa-IN": "Punjabi",
+    "ta-IN": "Tamil",
+    "te-IN": "Telugu",
+    "as-IN": "Assamese",
+};