From dc800bdd6389e01581fea8d30a17bfe3ba3faeb1 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar
Date: Wed, 25 Mar 2026 15:44:54 +0530
Subject: [PATCH] chore: add custom recordings documentation
---
api/routes/s3_signed_url.py | 102 ++++++++++++------
docs/docs.json | 1 +
docs/voice-agent/custom-recordings.mdx | 79 ++++++++++++++
.../components/RecordingsDialog.tsx | 54 +++++++++-
ui/src/client/sdk.gen.ts | 8 +-
ui/src/client/types.gen.ts | 4 +
6 files changed, 211 insertions(+), 37 deletions(-)
create mode 100644 docs/voice-agent/custom-recordings.mdx
diff --git a/api/routes/s3_signed_url.py b/api/routes/s3_signed_url.py
index 2889278..2ea2dda 100644
--- a/api/routes/s3_signed_url.py
+++ b/api/routes/s3_signed_url.py
@@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
router = APIRouter(prefix="/s3", tags=["s3"])
+def _extract_org_id_from_key(key: str) -> Optional[int]:
+    """Try to extract an organization ID from a storage key.
+
+    Matches ``{prefix}/{org_id}/...`` where *org_id* is ASCII digits only (bare
+    ``isdigit`` admits e.g. ``²``, which ``int`` rejects); otherwise returns ``None``.
+    """
+    parts = key.split("/")
+    if len(parts) >= 3 and parts[1].isascii() and parts[1].isdigit():
+        return int(parts[1])
+    return None
+
+
+def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
+    """Extract a workflow_run_id from legacy key formats.
+
+    Supports:
+    - ``transcripts/{run_id}.txt``
+    - ``recordings/{run_id}.wav``
+
+    Returns ``None`` unless the key matches one of them with an ASCII-digit id.
+    """
+    if key.startswith("transcripts/") and key.endswith(".txt"):
+        run_id_str = key[len("transcripts/") : -4]
+    elif key.startswith("recordings/") and key.endswith(".wav"):
+        run_id_str = key[len("recordings/") : -4]
+    else:
+        return None
+    # isdigit() alone accepts characters such as "²" that int() cannot parse.
+    return int(run_id_str) if run_id_str.isascii() and run_id_str.isdigit() else None
+
+
+# Keep for backward compat with file-metadata endpoint
async def _validate_and_extract_workflow_run_id(
key: str, allow_special_paths: bool = False
) -> Optional[int]:
@@ -118,64 +150,68 @@ async def get_signed_url(
key: Annotated[str, Query(description="S3 object key")],
expires_in: int = 3600,
inline: bool = False,
+ storage_backend: Annotated[
+ Optional[str],
+ Query(
+ description="Storage backend to use (e.g. 'minio', 's3'). "
+ "When omitted the backend is inferred from the resource."
+ ),
+ ] = None,
user=Depends(get_user),
):
- """Return a short-lived signed URL for a transcript or recording file stored on S3.
+ """Return a short-lived signed URL for a file stored on S3 / MinIO.
Access Control:
+ * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
+ authorized by matching the org_id against the requesting user's
+ organization.
+ * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
+ are authorized via the workflow run they belong to.
* Superusers can request any key.
- * Regular users can only request resources belonging to **their** workflow runs.
"""
- # Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
- run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
- if run_id is None:
- raise HTTPException(status_code=400, detail="Invalid key format")
+ # ------------------------------------------------------------------
+ # 1. Authorize
+ # ------------------------------------------------------------------
+ workflow_run = None
- # Authorize and get workflow run
- workflow_run = await _authorize_and_get_workflow_run(run_id, user)
+ org_id = _extract_org_id_from_key(key)
+ if org_id is not None:
+ # Generic org-based auth
+ if not user.is_superuser and org_id != user.selected_organization_id:
+ raise HTTPException(status_code=403, detail="Access denied")
+ else:
+ # Legacy workflow-run-based auth
+ run_id = _extract_legacy_workflow_run_id(key)
+ if run_id is None:
+ raise HTTPException(status_code=400, detail="Invalid key format")
+ workflow_run = await _authorize_and_get_workflow_run(run_id, user)
# ------------------------------------------------------------------
- # 3. Generate the signed URL using the correct storage backend
+ # 2. Resolve storage backend
# ------------------------------------------------------------------
try:
- # Use the storage backend recorded when the file was uploaded
- if (
+ if storage_backend:
+ storage = get_storage_for_backend(storage_backend)
+ elif (
workflow_run
and hasattr(workflow_run, "storage_backend")
and workflow_run.storage_backend
):
- backend = workflow_run.storage_backend
- storage = get_storage_for_backend(backend)
- logger.info(
- f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
- )
+ storage = get_storage_for_backend(workflow_run.storage_backend)
else:
- # Fallback to current storage for legacy records without storage_backend
storage = storage_fs
- current_backend = StorageBackend.get_current_backend()
- logger.warning(
- f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
- )
+ # ------------------------------------------------------------------
+ # 3. Generate the signed URL
+ # ------------------------------------------------------------------
url = await storage.aget_signed_url(
key, expiration=expires_in, force_inline=inline
)
if not url:
raise HTTPException(status_code=500, detail="Failed to generate signed URL")
- # Log successful URL generation
- backend_info = (
- f"stored {backend}"
- if workflow_run
- and hasattr(workflow_run, "storage_backend")
- and workflow_run.storage_backend
- else f"current {StorageBackend.get_current_backend().name}"
- )
- logger.info(
- f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
- )
-
+ logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
return {"url": url, "expires_in": expires_in}
except ClientError as exc:
logger.error(f"Error generating signed URL: {exc}")
diff --git a/docs/docs.json b/docs/docs.json
index 0c3ef4b..9b55211 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -54,6 +54,7 @@
"pages": [
"voice-agent/introduction",
"voice-agent/editing-a-workflow",
+ "voice-agent/custom-recordings",
"voice-agent/template-variables",
{
"group": "Tools",
diff --git a/docs/voice-agent/custom-recordings.mdx b/docs/voice-agent/custom-recordings.mdx
new file mode 100644
index 0000000..6b3205a
--- /dev/null
+++ b/docs/voice-agent/custom-recordings.mdx
@@ -0,0 +1,79 @@
+---
+title: "Custom Recordings"
+description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
+---
+
+Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated speech (via a cloned voice) for dynamic responses. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
+
+
+
+## Why use custom recordings?
+
+- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
+- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
+- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
+
+## Prerequisites
+
+- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
+- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
+
+## Step 1: Clone your voice
+
+Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
+
+1. Go to Cartesia and navigate to **Instant Clone**.
+2. Record a short audio clip (up to 10 seconds) of your voice.
+3. Give the clone a name and select your language.
+4. Copy the **Voice ID** — you will need it in the next step.
+
+<Note>
+You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
+</Note>
+
+## Step 2: Configure the cloned voice in Dograh
+
+1. Go to your agent's **Model Configuration** in the Dograh dashboard.
+2. Under voice settings, select **Add Voice ID manually**.
+3. Paste the Voice ID from your cloned voice.
+4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
+5. Enter the provider's API key if you haven't already.
+6. Save the configuration.
+
+## Step 3: Upload recordings
+
+Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
+
+For each recording:
+
+1. Click **Record** to capture audio in the browser, or choose an existing audio file to upload.
+2. If you are recording, speak the exact phrase you want the agent to use.
+3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
+4. Verify the transcription is correct — edit it if needed.
+5. Click **Upload**.
+
+<Warning>
+Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
+</Warning>
+
+## Step 4: Build the workflow
+
+Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current Voice ID.
+
+For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
+
+## Tips for best results
+
+- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
+- **Use professional voice cloning** (when your provider offers it) and supply more sample audio for a higher-quality voice clone.
+- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
+- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.
diff --git a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
index b917f2b..2f0692f 100644
--- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
@@ -1,9 +1,10 @@
-import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
+import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
import { useCallback, useEffect, useRef, useState } from "react";
import {
createRecordingApiV1WorkflowRecordingsPost,
deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
+ getSignedUrlApiV1S3SignedUrlGet,
getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
listRecordingsApiV1WorkflowRecordingsGet,
transcribeAudioApiV1WorkflowRecordingsTranscribePost,
@@ -58,6 +59,8 @@ export const RecordingsDialog = ({
const [recordingStep, setRecordingStep] = useState("idle");
const [recordingFilename, setRecordingFilename] = useState("");
const [recordingDuration, setRecordingDuration] = useState(0);
+ const [playingId, setPlayingId] = useState(null);
+ const audioRef = useRef(null);
const mediaRecorderRef = useRef(null);
const audioChunksRef = useRef([]);
const recordingTimerRef = useRef | null>(null);
@@ -110,6 +113,14 @@ export const RecordingsDialog = ({
setRecordingDuration(0);
}, []);
+  // Halt the clip that is currently playing (if any), drop the reference to
+  // its audio element, and clear the "now playing" marker.
+  const stopPlayback = useCallback(() => {
+    audioRef.current?.pause();
+    audioRef.current = null;
+    setPlayingId(null);
+  }, []);
+
useEffect(() => {
if (open) {
fetchRecordings();
@@ -125,8 +136,9 @@ export const RecordingsDialog = ({
if (!open) {
stopRecording();
stopRecordingTimer();
+ stopPlayback();
}
- }, [open, stopRecording, stopRecordingTimer]);
+ }, [open, stopRecording, stopRecordingTimer, stopPlayback]);
const transcribeFile = async (file: File) => {
setRecordingStep("transcribing");
@@ -295,6 +307,33 @@ export const RecordingsDialog = ({
}
};
+  // Toggle playback for a recording: stop if it is already playing, else fetch a signed URL and play it.
+  const handlePlay = async (rec: RecordingResponseSchema) => {
+    if (playingId === rec.recording_id) {
+      stopPlayback();
+      return;
+    }
+    stopPlayback();
+    try {
+      const result = await getSignedUrlApiV1S3SignedUrlGet({
+        query: { key: rec.storage_key, storage_backend: rec.storage_backend },
+      });
+      if (!result.data?.url) {
+        setError("Failed to get audio URL");
+        return;
+      }
+      const audio = new Audio(result.data.url);
+      audio.onended = () => setPlayingId(null);
+      audioRef.current = audio;
+      setPlayingId(rec.recording_id);
+      await audio.play();
+    } catch {
+      // play() rejects on blocked autoplay or unplayable media; clear the state so the row is not stuck as "playing".
+      stopPlayback();
+      setError("Failed to play recording");
+    }
+  };
+
const isRecording = recordingStep === "recording";
const isTranscribing = recordingStep === "transcribing";
const isBusy = uploading || isRecording || isTranscribing;
@@ -540,6 +579,17 @@ export const RecordingsDialog = ({
{rec.transcript}