From dc800bdd6389e01581fea8d30a17bfe3ba3faeb1 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar <abhishek@a6k.me>
Date: Wed, 25 Mar 2026 15:44:54 +0530
Subject: [PATCH] feat: org-scoped signed URLs, recording playback, custom recordings docs

---
 api/routes/s3_signed_url.py                   | 102 ++++++++++++------
 docs/docs.json                                |   1 +
 docs/voice-agent/custom-recordings.mdx        |  79 ++++++++++++++
 .../components/RecordingsDialog.tsx           |  54 +++++++++-
 ui/src/client/sdk.gen.ts                      |   8 +-
 ui/src/client/types.gen.ts                    |   4 +
 6 files changed, 211 insertions(+), 37 deletions(-)
 create mode 100644 docs/voice-agent/custom-recordings.mdx

diff --git a/api/routes/s3_signed_url.py b/api/routes/s3_signed_url.py
index 2889278..2ea2dda 100644
--- a/api/routes/s3_signed_url.py
+++ b/api/routes/s3_signed_url.py
@@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
 router = APIRouter(prefix="/s3", tags=["s3"])
 
 
+def _extract_org_id_from_key(key: str) -> Optional[int]:
+    """Return the org ID embedded in ``{prefix}/{org_id}/...``, else ``None``.
+
+    *org_id* must be ASCII digits only: ``str.isdigit`` alone also accepts
+    characters such as "²", which ``int`` rejects with ``ValueError``.
+    """
+    parts = key.split("/")
+    if len(parts) >= 3 and parts[1].isascii() and parts[1].isdigit():
+        return int(parts[1])
+    return None
+
+
+def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
+    """Extract a workflow_run_id from legacy key formats.
+
+    Supports (``run_id`` must consist of ASCII digits only):
+      - ``transcripts/{run_id}.txt``
+      - ``recordings/{run_id}.wav``
+
+    Returns ``None`` when the key does not match a legacy pattern.
+    """
+    if key.startswith("transcripts/") and key.endswith(".txt"):
+        run_id_str = key[len("transcripts/") : -4]
+    elif key.startswith("recordings/") and key.endswith(".wav"):
+        run_id_str = key[len("recordings/") : -4]
+    else:
+        return None
+    # isascii() guard: str.isdigit() accepts e.g. "²", which int() rejects.
+    return int(run_id_str) if run_id_str.isascii() and run_id_str.isdigit() else None
+
+
+# Kept for backward compatibility with the file-metadata endpoint.
 async def _validate_and_extract_workflow_run_id(
     key: str, allow_special_paths: bool = False
 ) -> Optional[int]:
@@ -118,64 +150,68 @@ async def get_signed_url(
     key: Annotated[str, Query(description="S3 object key")],
     expires_in: int = 3600,
     inline: bool = False,
+    storage_backend: Annotated[
+        Optional[str],
+        Query(
+            description="Storage backend to use (e.g. 'minio', 's3'). "
+            "When omitted the backend is inferred from the resource."
+        ),
+    ] = None,
     user=Depends(get_user),
 ):
-    """Return a short-lived signed URL for a transcript or recording file stored on S3.
+    """Return a short-lived signed URL for a file stored on S3 / MinIO.
 
     Access Control:
+    * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+      authorized by matching the org_id against the requesting user's
+      organization.
+    * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+      are authorized via the workflow run they belong to.
     * Superusers can request any key.
-    * Regular users can only request resources belonging to **their** workflow runs.
     """
 
-    # Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
-    run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
-    if run_id is None:
-        raise HTTPException(status_code=400, detail="Invalid key format")
+    # ------------------------------------------------------------------
+    # 1. Authorize
+    # ------------------------------------------------------------------
+    workflow_run = None
 
-    # Authorize and get workflow run
-    workflow_run = await _authorize_and_get_workflow_run(run_id, user)
+    org_id = _extract_org_id_from_key(key)
+    if org_id is not None:
+        # Generic org-based auth
+        if not user.is_superuser and org_id != user.selected_organization_id:
+            raise HTTPException(status_code=403, detail="Access denied")
+    else:
+        # Legacy workflow-run-based auth
+        run_id = _extract_legacy_workflow_run_id(key)
+        if run_id is None:
+            raise HTTPException(status_code=400, detail="Invalid key format")
+        workflow_run = await _authorize_and_get_workflow_run(run_id, user)
 
     # ------------------------------------------------------------------
-    # 3. Generate the signed URL using the correct storage backend
+    # 2. Resolve storage backend
     # ------------------------------------------------------------------
     try:
-        # Use the storage backend recorded when the file was uploaded
-        if (
+        if storage_backend:
+            storage = get_storage_for_backend(storage_backend)
+        elif (
             workflow_run
             and hasattr(workflow_run, "storage_backend")
             and workflow_run.storage_backend
         ):
-            backend = workflow_run.storage_backend
-            storage = get_storage_for_backend(backend)
-            logger.info(
-                f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
-            )
+            storage = get_storage_for_backend(workflow_run.storage_backend)
         else:
-            # Fallback to current storage for legacy records without storage_backend
             storage = storage_fs
-            current_backend = StorageBackend.get_current_backend()
-            logger.warning(
-                f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
-            )
 
+        # ------------------------------------------------------------------
+        # 3. Generate the signed URL
+        # ------------------------------------------------------------------
         url = await storage.aget_signed_url(
             key, expiration=expires_in, force_inline=inline
         )
         if not url:
             raise HTTPException(status_code=500, detail="Failed to generate signed URL")
 
-        # Log successful URL generation
-        backend_info = (
-            f"stored {backend}"
-            if workflow_run
-            and hasattr(workflow_run, "storage_backend")
-            and workflow_run.storage_backend
-            else f"current {StorageBackend.get_current_backend().name}"
-        )
-        logger.info(
-            f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
-        )
-
+        logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
         return {"url": url, "expires_in": expires_in}
     except ClientError as exc:
         logger.error(f"Error generating signed URL: {exc}")
diff --git a/docs/docs.json b/docs/docs.json
index 0c3ef4b..9b55211 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -54,6 +54,7 @@
             "pages": [
               "voice-agent/introduction",
               "voice-agent/editing-a-workflow",
+              "voice-agent/custom-recordings",
               "voice-agent/template-variables",
               {
                 "group": "Tools",
diff --git a/docs/voice-agent/custom-recordings.mdx b/docs/voice-agent/custom-recordings.mdx
new file mode 100644
index 0000000..6b3205a
--- /dev/null
+++ b/docs/voice-agent/custom-recordings.mdx
@@ -0,0 +1,79 @@
+---
+title: "Custom Recordings"
+description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
+---
+
+Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated responses (spoken by TTS in a cloned voice) for everything dynamic. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
+
+<iframe
+  width="560"
+  height="315"
+  src="https://www.youtube.com/embed/1uZqhG0_cIo"
+  title="YouTube video player"
+  frameborder="0"
+  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+  referrerpolicy="strict-origin-when-cross-origin"
+  allowfullscreen>
+</iframe>
+
+## Why use custom recordings?
+
+- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
+- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
+- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
+
+## Prerequisites
+
+- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
+- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
+
+## Step 1: Clone your voice
+
+Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
+
+1. Go to Cartesia and navigate to **Instant Clone**.
+2. Record a short audio clip (up to 10 seconds) of your voice.
+3. Give the clone a name and select your language.
+4. Copy the **Voice ID** — you will need it in the next step.
+
+<Note>
+You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
+</Note>
+
+## Step 2: Configure the cloned voice in Dograh
+
+1. Go to your agent's **Model Configuration** in the Dograh dashboard.
+2. Under voice settings, select **Add Voice ID manually**.
+3. Paste the Voice ID from your cloned voice.
+4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
+5. Enter the provider's API key if you haven't already.
+6. Save the configuration.
+
+## Step 3: Upload recordings
+
+Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
+
+For each recording:
+
+1. Click **Record** (or upload a file).
+2. Speak the exact phrase you want the agent to use (skip this step if you uploaded a file).
+3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
+4. Verify the transcription is correct — edit it if needed.
+5. Click **Upload**.
+
+<Warning>
+Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
+</Warning>
+
+## Step 4: Build the workflow
+
+Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current Voice ID.
+
+For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
+
+## Tips for best results
+
+- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
+- **Use professional voice cloning** (when your provider offers it) and provide more sample audio for a higher-quality voice clone.
+- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
+- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.
diff --git a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
index b917f2b..2f0692f 100644
--- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
@@ -1,9 +1,10 @@
-import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
+import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 
 import {
     createRecordingApiV1WorkflowRecordingsPost,
     deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
+    getSignedUrlApiV1S3SignedUrlGet,
     getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
     listRecordingsApiV1WorkflowRecordingsGet,
     transcribeAudioApiV1WorkflowRecordingsTranscribePost,
@@ -58,6 +59,8 @@ export const RecordingsDialog = ({
     const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
     const [recordingFilename, setRecordingFilename] = useState("");
     const [recordingDuration, setRecordingDuration] = useState(0);
+    const [playingId, setPlayingId] = useState<string | null>(null);
+    const audioRef = useRef<HTMLAudioElement | null>(null);
     const mediaRecorderRef = useRef<MediaRecorder | null>(null);
     const audioChunksRef = useRef<Blob[]>([]);
     const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
@@ -110,6 +113,14 @@ export const RecordingsDialog = ({
         setRecordingDuration(0);
     }, []);
 
+    // Halt any in-flight audio, drop the element, and reset the play indicator.
+    const stopPlayback = useCallback(() => {
+        const activeAudio = audioRef.current;
+        activeAudio?.pause();
+        audioRef.current = null;
+        setPlayingId(null);
+    }, []);
+
     useEffect(() => {
         if (open) {
             fetchRecordings();
@@ -125,8 +136,9 @@ export const RecordingsDialog = ({
         if (!open) {
             stopRecording();
             stopRecordingTimer();
+            stopPlayback();
         }
-    }, [open, stopRecording, stopRecordingTimer]);
+    }, [open, stopRecording, stopRecordingTimer, stopPlayback]);
 
     const transcribeFile = async (file: File) => {
         setRecordingStep("transcribing");
@@ -295,6 +307,33 @@ export const RecordingsDialog = ({
         }
     };
 
+    // Toggle playback of `rec`; a newer click or closing the dialog cancels a pending load.
+    const handlePlay = async (rec: RecordingResponseSchema) => {
+        if (playingId === rec.recording_id) return stopPlayback(); // second click stops it
+        stopPlayback();
+        const audio = new Audio(); // claim the player before awaiting so later calls can supersede us
+        audio.onended = () => setPlayingId(null);
+        audioRef.current = audio;
+        setPlayingId(rec.recording_id);
+        try {
+            const result = await getSignedUrlApiV1S3SignedUrlGet({
+                query: { key: rec.storage_key, storage_backend: rec.storage_backend },
+            });
+            if (audioRef.current !== audio) return; // stopped or replaced while the URL was loading
+            if (!result.data?.url) {
+                stopPlayback();
+                setError("Failed to get audio URL");
+                return;
+            }
+            audio.src = result.data.url;
+            await audio.play();
+        } catch {
+            if (audioRef.current !== audio) return; // interrupted on purpose; not a real failure
+            stopPlayback(); // clear the stale "playing" state left by the failed attempt
+            setError("Failed to play recording");
+        }
+    };
+
     const isRecording = recordingStep === "recording";
     const isTranscribing = recordingStep === "transcribing";
     const isBusy = uploading || isRecording || isTranscribing;
@@ -540,6 +579,17 @@ export const RecordingsDialog = ({
                                         {rec.transcript}
                                     </p>
                                 </div>
+                                <Button
+                                    size="sm"
+                                    variant="ghost"
+                                    onClick={() => handlePlay(rec)}
+                                >
+                                    {playingId === rec.recording_id ? (
+                                        <Pause className="w-4 h-4" />
+                                    ) : (
+                                        <Play className="w-4 h-4" />
+                                    )}
+                                </Button>
                                 <Button
                                     size="sm"
                                     variant="ghost"
diff --git a/ui/src/client/sdk.gen.ts b/ui/src/client/sdk.gen.ts
index ff4e030..319f1f5 100644
--- a/ui/src/client/sdk.gen.ts
+++ b/ui/src/client/sdk.gen.ts
@@ -1064,11 +1064,15 @@ export const getCampaignDefaultsApiV1OrganizationsCampaignDefaultsGet = <ThrowOn
 
 /**
  * Generate a signed S3 URL
- * Return a short-lived signed URL for a transcript or recording file stored on S3.
+ * Return a short-lived signed URL for a file stored on S3 / MinIO.
  *
  * Access Control:
+ * * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+ * authorized by matching the org_id against the requesting user's
+ * organization.
+ * * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+ * are authorized via the workflow run they belong to.
  * * Superusers can request any key.
- * * Regular users can only request resources belonging to **their** workflow runs.
  */
 export const getSignedUrlApiV1S3SignedUrlGet = <ThrowOnError extends boolean = false>(options: Options<GetSignedUrlApiV1S3SignedUrlGetData, ThrowOnError>) => {
     return (options.client ?? _heyApiClient).get<GetSignedUrlApiV1S3SignedUrlGetResponse, GetSignedUrlApiV1S3SignedUrlGetError, ThrowOnError>({
diff --git a/ui/src/client/types.gen.ts b/ui/src/client/types.gen.ts
index c3e9507..ae4d005 100644
--- a/ui/src/client/types.gen.ts
+++ b/ui/src/client/types.gen.ts
@@ -3964,6 +3964,10 @@ export type GetSignedUrlApiV1S3SignedUrlGetData = {
         key: string;
         expires_in?: number;
         inline?: boolean;
+        /**
+         * Storage backend to use (e.g. 'minio', 's3'). When omitted the backend is inferred from the resource.
+         */
+        storage_backend?: string | null;
     };
     url: '/api/v1/s3/signed-url';
 };