diff --git a/api/routes/s3_signed_url.py b/api/routes/s3_signed_url.py index 2889278..2ea2dda 100644 --- a/api/routes/s3_signed_url.py +++ b/api/routes/s3_signed_url.py @@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel): router = APIRouter(prefix="/s3", tags=["s3"]) +def _extract_org_id_from_key(key: str) -> Optional[int]: + """Try to extract an organization ID from a storage key. + + Matches keys of the form ``{prefix}/{org_id}/...`` where *org_id* is a + positive integer. Returns ``None`` when the pattern does not match. + """ + parts = key.split("/") + if len(parts) >= 3 and parts[1].isdigit(): + return int(parts[1]) + return None + + +def _extract_legacy_workflow_run_id(key: str) -> Optional[int]: + """Extract a workflow_run_id from legacy key formats. + + Supports: + - ``transcripts/{run_id}.txt`` + - ``recordings/{run_id}.wav`` + + Returns ``None`` when the key does not match a legacy pattern. + """ + if key.startswith("transcripts/") and key.endswith(".txt"): + run_id_str = key[len("transcripts/") : -4] + elif key.startswith("recordings/") and key.endswith(".wav"): + run_id_str = key[len("recordings/") : -4] + else: + return None + + return int(run_id_str) if run_id_str.isdigit() else None + + +# Keep for backward compat with file-metadata endpoint async def _validate_and_extract_workflow_run_id( key: str, allow_special_paths: bool = False ) -> Optional[int]: @@ -118,64 +150,68 @@ async def get_signed_url( key: Annotated[str, Query(description="S3 object key")], expires_in: int = 3600, inline: bool = False, + storage_backend: Annotated[ + Optional[str], + Query( + description="Storage backend to use (e.g. 'minio', 's3'). " + "When omitted the backend is inferred from the resource." + ), + ] = None, user=Depends(get_user), ): - """Return a short-lived signed URL for a transcript or recording file stored on S3. + """Return a short-lived signed URL for a file stored on S3 / MinIO. Access Control: + * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are + authorized by matching the org_id against the requesting user's + organization. + * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``) + are authorized via the workflow run they belong to. * Superusers can request any key. - * Regular users can only request resources belonging to **their** workflow runs. """ - # Validate key and extract workflow_run_id (don't allow special paths for signed URLs) - run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False) - if run_id is None: - raise HTTPException(status_code=400, detail="Invalid key format") + # ------------------------------------------------------------------ + # 1. Authorize + # ------------------------------------------------------------------ + workflow_run = None - # Authorize and get workflow run - workflow_run = await _authorize_and_get_workflow_run(run_id, user) + org_id = _extract_org_id_from_key(key) + if org_id is not None: + # Generic org-based auth + if not user.is_superuser and org_id != user.selected_organization_id: + raise HTTPException(status_code=403, detail="Access denied") + else: + # Legacy workflow-run-based auth + run_id = _extract_legacy_workflow_run_id(key) + if run_id is None: + raise HTTPException(status_code=400, detail="Invalid key format") + workflow_run = await _authorize_and_get_workflow_run(run_id, user) # ------------------------------------------------------------------ - # 3. Generate the signed URL using the correct storage backend + # 2. Resolve storage backend # ------------------------------------------------------------------ try: - # Use the storage backend recorded when the file was uploaded - if ( + if storage_backend: + storage = get_storage_for_backend(storage_backend) + elif ( workflow_run and hasattr(workflow_run, "storage_backend") and workflow_run.storage_backend ): - backend = workflow_run.storage_backend - storage = get_storage_for_backend(backend) - logger.info( - f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}" - ) + storage = get_storage_for_backend(workflow_run.storage_backend) else: - # Fallback to current storage for legacy records without storage_backend storage = storage_fs - current_backend = StorageBackend.get_current_backend() - logger.warning( - f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}" - ) + # ------------------------------------------------------------------ + # 3. Generate the signed URL + # ------------------------------------------------------------------ url = await storage.aget_signed_url( key, expiration=expires_in, force_inline=inline ) if not url: raise HTTPException(status_code=500, detail="Failed to generate signed URL") - # Log successful URL generation - backend_info = ( - f"stored {backend}" - if workflow_run - and hasattr(workflow_run, "storage_backend") - and workflow_run.storage_backend - else f"current {StorageBackend.get_current_backend().name}" - ) - logger.info( - f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s" - ) - + logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s") return {"url": url, "expires_in": expires_in} except ClientError as exc: logger.error(f"Error generating signed URL: {exc}") diff --git a/docs/docs.json b/docs/docs.json index 0c3ef4b..9b55211 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -54,6 +54,7 @@ "pages": [ "voice-agent/introduction", "voice-agent/editing-a-workflow", + "voice-agent/custom-recordings", "voice-agent/template-variables", { "group": "Tools", diff --git a/docs/voice-agent/custom-recordings.mdx b/docs/voice-agent/custom-recordings.mdx new file mode 100644 index 0000000..6b3205a --- /dev/null +++ b/docs/voice-agent/custom-recordings.mdx @@ -0,0 +1,79 @@ +--- +title: "Custom Recordings" +description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations." +--- + +Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated speech (via a cloned voice) for dynamic responses. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue. + + + +## Why use custom recordings? + +- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments. +- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate. +- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime. + +## Prerequisites + +- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram). +- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice). + +## Step 1: Clone your voice + +Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia: + +1. Go to Cartesia and navigate to **Instant Clone**. +2. Record a short audio clip (up to 10 seconds) of your voice. +3. Give the clone a name and select your language. +4. Copy the **Voice ID** — you will need it in the next step. + + +You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice. + + +## Step 2: Configure the cloned voice in Dograh + +1. Go to your agent's **Model Configuration** in the Dograh dashboard. +2. Under voice settings, select **Add Voice ID manually**. +3. Paste the Voice ID from your cloned voice. +4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia). +5. Enter the provider's API key if you haven't already. +6. Save the configuration. + +## Step 3: Upload recordings + +Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser. + +For each recording: + +1. Click **Record** (or upload a file). +2. Speak the exact phrase you want the agent to use. +3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`). +4. Verify the transcription is correct — edit it if needed. +5. Click **Upload**. + + +Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses. + + +## Step 4: Build the workflow + +Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current Voice ID. + +For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS. + +## Tips for best results + +- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice. +- **Use pro cloning services** (when available) and provide more sample audio for a higher-quality voice clone. +- **Keep recordings concise** — short, focused clips work best for specific conversation moments. +- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved. diff --git a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx index b917f2b..2f0692f 100644 --- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx +++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx @@ -1,9 +1,10 @@ -import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react"; +import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react"; import { useCallback, useEffect, useRef, useState } from "react"; import { createRecordingApiV1WorkflowRecordingsPost, deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete, + getSignedUrlApiV1S3SignedUrlGet, getUploadUrlApiV1WorkflowRecordingsUploadUrlPost, listRecordingsApiV1WorkflowRecordingsGet, transcribeAudioApiV1WorkflowRecordingsTranscribePost, @@ -58,6 +59,8 @@ export const RecordingsDialog = ({ const [recordingStep, setRecordingStep] = useState("idle"); const [recordingFilename, setRecordingFilename] = useState(""); const [recordingDuration, setRecordingDuration] = useState(0); + const [playingId, setPlayingId] = useState(null); + const audioRef = useRef(null); const mediaRecorderRef = useRef(null); const audioChunksRef = useRef([]); const recordingTimerRef = useRef | null>(null); @@ -110,6 +113,14 @@ export const RecordingsDialog = ({ setRecordingDuration(0); }, []); + const stopPlayback = useCallback(() => { + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current = null; + } + setPlayingId(null); + }, []); + useEffect(() => { if (open) { fetchRecordings(); @@ -125,8 +136,9 @@ export const RecordingsDialog = ({ if (!open) { stopRecording(); stopRecordingTimer(); + stopPlayback(); } - }, [open, stopRecording, stopRecordingTimer]); + }, [open, stopRecording, stopRecordingTimer, stopPlayback]); const transcribeFile = async (file: File) => { setRecordingStep("transcribing"); @@ -295,6 +307,33 @@ export const RecordingsDialog = ({ } }; + const handlePlay = async (rec: RecordingResponseSchema) => { + if (playingId === rec.recording_id) { + stopPlayback(); + return; + } + stopPlayback(); + try { + const result = await getSignedUrlApiV1S3SignedUrlGet({ + query: { + key: rec.storage_key, + storage_backend: rec.storage_backend, + }, + }); + if (!result.data?.url) { + setError("Failed to get audio URL"); + return; + } + const audio = new Audio(result.data.url); + audio.onended = () => setPlayingId(null); + audioRef.current = audio; + setPlayingId(rec.recording_id); + await audio.play(); + } catch { + setError("Failed to play recording"); + } + }; + const isRecording = recordingStep === "recording"; const isTranscribing = recordingStep === "transcribing"; const isBusy = uploading || isRecording || isTranscribing; @@ -540,6 +579,17 @@ export const RecordingsDialog = ({ {rec.transcript}

+