chore: add custom recordings documentation

2026-07-22 11:51:04 +02:00 · 2026-03-25 15:44:54 +05:30 · 2026-03-25 15:44:54 +05:30 · dc800bdd63
commit dc800bdd63
parent 2fa4191d9b
6 changed files with 211 additions and 37 deletions
--- a/api/routes/s3_signed_url.py
+++ b/api/routes/s3_signed_url.py
@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
 router = APIRouter(prefix="/s3", tags=["s3"])


+def _extract_org_id_from_key(key: str) -> Optional[int]:
+    """Try to extract an organization ID from a storage key.
+
+    Matches keys of the form ``{prefix}/{org_id}/...`` where *org_id* is a
+    non-empty run of ASCII digits.  Returns ``None`` when the pattern does
+    not match.
+    """
+    parts = key.split("/")
+    # str.isdigit() alone also accepts non-ASCII digits (e.g. U+00B2), which
+    # int() rejects with ValueError; require ASCII so the key never crashes us.
+    if len(parts) >= 3 and parts[1].isascii() and parts[1].isdigit():
+        return int(parts[1])
+    return None
+
+
+def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
+    """Extract a workflow_run_id from legacy key formats.
+
+    Supports:
+      - ``transcripts/{run_id}.txt``
+      - ``recordings/{run_id}.wav``
+
+    *run_id* must be a non-empty run of ASCII digits.  Returns ``None`` when
+    the key does not match a legacy pattern.
+    """
+    if key.startswith("transcripts/") and key.endswith(".txt"):
+        run_id_str = key[len("transcripts/") : -4]
+    elif key.startswith("recordings/") and key.endswith(".wav"):
+        run_id_str = key[len("recordings/") : -4]
+    else:
+        return None
+
+    # str.isdigit() alone also accepts non-ASCII digits (e.g. U+00B2), which
+    # int() rejects with ValueError; require ASCII so a bad key yields None.
+    if run_id_str.isascii() and run_id_str.isdigit():
+        return int(run_id_str)
+    return None
+
+
+# Kept for backward compatibility with the file-metadata endpoint.
 async def _validate_and_extract_workflow_run_id(
    key: str, allow_special_paths: bool = False
 ) -> Optional[int]:
@ -118,64 +150,68 @@ async def get_signed_url(
    key: Annotated[str, Query(description="S3 object key")],
    expires_in: int = 3600,
    inline: bool = False,
+    storage_backend: Annotated[
+        Optional[str],
+        Query(
+            description="Storage backend to use (e.g. 'minio', 's3'). "
+            "When omitted the backend is inferred from the resource."
+        ),
+    ] = None,
    user=Depends(get_user),
 ):
-    """Return a short-lived signed URL for a transcript or recording file stored on S3.
+    """Return a short-lived signed URL for a file stored on S3 / MinIO.

    Access Control:
+    * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+      authorized by matching the org_id against the requesting user's
+      organization.
+    * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+      are authorized via the workflow run they belong to.
    * Superusers can request any key.
-    * Regular users can only request resources belonging to **their** workflow runs.
    """

-    # Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
-    run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
-    if run_id is None:
-        raise HTTPException(status_code=400, detail="Invalid key format")
+    # ------------------------------------------------------------------
+    # 1. Authorize
+    # ------------------------------------------------------------------
+    workflow_run = None

-    # Authorize and get workflow run
-    workflow_run = await _authorize_and_get_workflow_run(run_id, user)
+    org_id = _extract_org_id_from_key(key)
+    if org_id is not None:
+        # Generic org-based auth
+        if not user.is_superuser and org_id != user.selected_organization_id:
+            raise HTTPException(status_code=403, detail="Access denied")
+    else:
+        # Legacy workflow-run-based auth
+        run_id = _extract_legacy_workflow_run_id(key)
+        if run_id is None:
+            raise HTTPException(status_code=400, detail="Invalid key format")
+        workflow_run = await _authorize_and_get_workflow_run(run_id, user)

    # ------------------------------------------------------------------
-    # 3. Generate the signed URL using the correct storage backend
+    # 2. Resolve storage backend
    # ------------------------------------------------------------------
    try:
-        # Use the storage backend recorded when the file was uploaded
-        if (
+        if storage_backend:
+            storage = get_storage_for_backend(storage_backend)
+        elif (
            workflow_run
            and hasattr(workflow_run, "storage_backend")
            and workflow_run.storage_backend
        ):
-            backend = workflow_run.storage_backend
-            storage = get_storage_for_backend(backend)
-            logger.info(
-                f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
-            )
+            storage = get_storage_for_backend(workflow_run.storage_backend)
        else:
-            # Fallback to current storage for legacy records without storage_backend
            storage = storage_fs
-            current_backend = StorageBackend.get_current_backend()
-            logger.warning(
-                f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
-            )

+        # ------------------------------------------------------------------
+        # 3. Generate the signed URL
+        # ------------------------------------------------------------------
        url = await storage.aget_signed_url(
            key, expiration=expires_in, force_inline=inline
        )
        if not url:
            raise HTTPException(status_code=500, detail="Failed to generate signed URL")

-        # Log successful URL generation
-        backend_info = (
-            f"stored {backend}"
-            if workflow_run
-            and hasattr(workflow_run, "storage_backend")
-            and workflow_run.storage_backend
-            else f"current {StorageBackend.get_current_backend().name}"
-        )
-        logger.info(
-            f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
-        )
-
+        logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
        return {"url": url, "expires_in": expires_in}
    except ClientError as exc:
        logger.error(f"Error generating signed URL: {exc}")
--- a/docs/docs.json
+++ b/docs/docs.json
@ -54,6 +54,7 @@
            "pages": [
              "voice-agent/introduction",
              "voice-agent/editing-a-workflow",
+              "voice-agent/custom-recordings",
              "voice-agent/template-variables",
              {
                "group": "Tools",
--- a/docs/voice-agent/custom-recordings.mdx
+++ b/docs/voice-agent/custom-recordings.mdx
@ -0,0 +1,79 @@
+---
+title: "Custom Recordings"
+description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
+---
+
+Custom recordings allow you to build **hybrid voice agents** that play your own pre-recorded audio for key parts of the conversation and fall back to LLM-generated responses, spoken by TTS in a cloned voice, for everything else. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
+
+<iframe
+  width="560"
+  height="315"
+  src="https://www.youtube.com/embed/1uZqhG0_cIo"
+  title="YouTube video player"
+  frameBorder="0"
+  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
+  referrerPolicy="strict-origin-when-cross-origin"
+  allowFullScreen>
+</iframe>
+
+## Why use custom recordings?
+
+- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
+- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
+- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
+
+## Prerequisites
+
+- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
+- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
+
+## Step 1: Clone your voice
+
+Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
+
+1. Go to Cartesia and navigate to **Instant Clone**.
+2. Record a short audio clip (up to 10 seconds) of your voice.
+3. Give the clone a name and select your language.
+4. Copy the **Voice ID** — you will need it in the next step.
+
+<Note>
+You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
+</Note>
+
+## Step 2: Configure the cloned voice in Dograh
+
+1. Go to your agent's **Model Configuration** in the Dograh dashboard.
+2. Under voice settings, select **Add Voice ID manually**.
+3. Paste the Voice ID from your cloned voice.
+4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
+5. Enter the provider's API key if you haven't already.
+6. Save the configuration.
+
+## Step 3: Upload recordings
+
+Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
+
+For each recording:
+
+1. Click **Record** (or upload a file).
+2. Speak the exact phrase you want the agent to use.
+3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
+4. Verify the transcription is correct — edit it if needed.
+5. Click **Upload**.
+
+<Warning>
+Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
+</Warning>
+
+## Step 4: Build the workflow
+
+Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this shows a list of all available recordings scoped to your current provider and Voice ID.
+
+For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
+
+## Tips for best results
+
+- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
+- **Use professional voice cloning** (when your provider offers it) and provide more sample audio for a higher-quality voice clone.
+- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
+- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.
--- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
@ -1,9 +1,10 @@
-import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
+import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";

 import {
    createRecordingApiV1WorkflowRecordingsPost,
    deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
+    getSignedUrlApiV1S3SignedUrlGet,
    getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
    listRecordingsApiV1WorkflowRecordingsGet,
    transcribeAudioApiV1WorkflowRecordingsTranscribePost,
@ -58,6 +59,8 @@ export const RecordingsDialog = ({
    const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
    const [recordingFilename, setRecordingFilename] = useState("");
    const [recordingDuration, setRecordingDuration] = useState(0);
+    const [playingId, setPlayingId] = useState<string | null>(null);
+    const audioRef = useRef<HTMLAudioElement | null>(null);
    const mediaRecorderRef = useRef<MediaRecorder | null>(null);
    const audioChunksRef = useRef<Blob[]>([]);
    const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
@ -110,6 +113,14 @@ export const RecordingsDialog = ({
        setRecordingDuration(0);
    }, []);

+    // Halt the active preview clip (if any) and clear the playing marker.
+    const stopPlayback = useCallback(() => {
+        audioRef.current?.pause();
+        audioRef.current = null;
+        setPlayingId(null);
+    }, []);
+
    useEffect(() => {
        if (open) {
            fetchRecordings();
@ -125,8 +136,9 @@ export const RecordingsDialog = ({
        if (!open) {
            stopRecording();
            stopRecordingTimer();
+            stopPlayback();
        }
-    }, [open, stopRecording, stopRecordingTimer]);
+    }, [open, stopRecording, stopRecordingTimer, stopPlayback]);

    const transcribeFile = async (file: File) => {
        setRecordingStep("transcribing");
@ -295,6 +307,33 @@ export const RecordingsDialog = ({
        }
    };

+    /**
+     * Toggle preview playback for `rec`.
+     *
+     * Clicking the recording that is currently playing stops it. Otherwise any
+     * active preview is stopped, a signed URL is requested for the recording's
+     * storage key, and the audio is played. Failures are surfaced via `setError`.
+     */
+    const handlePlay = async (rec: RecordingResponseSchema) => {
+        const wasPlaying = playingId === rec.recording_id;
+        stopPlayback();
+        if (wasPlaying) {
+            return;
+        }
+
+        let audio: HTMLAudioElement | null = null;
+        try {
+            const result = await getSignedUrlApiV1S3SignedUrlGet({
+                query: {
+                    key: rec.storage_key,
+                    storage_backend: rec.storage_backend,
+                },
+            });
+            if (!result.data?.url) {
+                setError("Failed to get audio URL");
+                return;
+            }
+            // Another preview may have started while the URL was being fetched;
+            // stop it so two clips never play over each other.
+            stopPlayback();
+            const player = new Audio(result.data.url);
+            audio = player;
+            player.onended = () => {
+                // Ignore a clip that has already been replaced by a newer one.
+                if (audioRef.current === player) {
+                    stopPlayback();
+                }
+            };
+            audioRef.current = player;
+            setPlayingId(rec.recording_id);
+            await player.play();
+        } catch {
+            if (audio !== null) {
+                // play() also rejects when the clip is paused or replaced before
+                // it starts; that is a user action, not a playback failure.
+                if (audioRef.current !== audio) {
+                    return;
+                }
+                // Reset the Play/Pause icon, which was switched before play() ran.
+                stopPlayback();
+            }
+            setError("Failed to play recording");
+        }
+    };
+
    const isRecording = recordingStep === "recording";
    const isTranscribing = recordingStep === "transcribing";
    const isBusy = uploading || isRecording || isTranscribing;
@ -540,6 +579,17 @@ export const RecordingsDialog = ({
                                        {rec.transcript}
                                    </p>
                                </div>
+                                <Button
+                                    size="sm"
+                                    variant="ghost"
+                                    onClick={() => handlePlay(rec)}
+                                >
+                                    {playingId === rec.recording_id ? (
+                                        <Pause className="w-4 h-4" />
+                                    ) : (
+                                        <Play className="w-4 h-4" />
+                                    )}
+                                </Button>
                                <Button
                                    size="sm"
                                    variant="ghost"
--- a/ui/src/client/sdk.gen.ts
+++ b/ui/src/client/sdk.gen.ts
@ -1064,11 +1064,15 @@ export const getCampaignDefaultsApiV1OrganizationsCampaignDefaultsGet = <ThrowOn

 /**
 * Generate a signed S3 URL
- * Return a short-lived signed URL for a transcript or recording file stored on S3.
+ * Return a short-lived signed URL for a file stored on S3 / MinIO.
 *
 * Access Control:
+ * * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+ * authorized by matching the org_id against the requesting user's
+ * organization.
+ * * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+ * are authorized via the workflow run they belong to.
 * * Superusers can request any key.
- * * Regular users can only request resources belonging to **their** workflow runs.
 */
 export const getSignedUrlApiV1S3SignedUrlGet = <ThrowOnError extends boolean = false>(options: Options<GetSignedUrlApiV1S3SignedUrlGetData, ThrowOnError>) => {
    return (options.client ?? _heyApiClient).get<GetSignedUrlApiV1S3SignedUrlGetResponse, GetSignedUrlApiV1S3SignedUrlGetError, ThrowOnError>({
--- a/ui/src/client/types.gen.ts
+++ b/ui/src/client/types.gen.ts
@ -3964,6 +3964,10 @@ export type GetSignedUrlApiV1S3SignedUrlGetData = {
        key: string;
        expires_in?: number;
        inline?: boolean;
+        /**
+         * Storage backend to use (e.g. 'minio', 's3'). When omitted the backend is inferred from the resource.
+         */
+        storage_backend?: string | null;
    };
    url: '/api/v1/s3/signed-url';
 };