chore: add custom recordings documentation

This commit is contained in:
Abhishek Kumar 2026-03-25 15:44:54 +05:30
parent 2fa4191d9b
commit dc800bdd63
6 changed files with 211 additions and 37 deletions

View file

@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
router = APIRouter(prefix="/s3", tags=["s3"])
def _extract_org_id_from_key(key: str) -> Optional[int]:
"""Try to extract an organization ID from a storage key.
Matches keys of the form ``{prefix}/{org_id}/...`` where *org_id* is a
positive integer. Returns ``None`` when the pattern does not match.
"""
parts = key.split("/")
if len(parts) >= 3 and parts[1].isdigit():
return int(parts[1])
return None
def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
"""Extract a workflow_run_id from legacy key formats.
Supports:
- ``transcripts/{run_id}.txt``
- ``recordings/{run_id}.wav``
Returns ``None`` when the key does not match a legacy pattern.
"""
if key.startswith("transcripts/") and key.endswith(".txt"):
run_id_str = key[len("transcripts/") : -4]
elif key.startswith("recordings/") and key.endswith(".wav"):
run_id_str = key[len("recordings/") : -4]
else:
return None
return int(run_id_str) if run_id_str.isdigit() else None
# Keep for backward compat with file-metadata endpoint
async def _validate_and_extract_workflow_run_id(
key: str, allow_special_paths: bool = False
) -> Optional[int]:
@ -118,64 +150,68 @@ async def get_signed_url(
key: Annotated[str, Query(description="S3 object key")],
expires_in: int = 3600,
inline: bool = False,
storage_backend: Annotated[
Optional[str],
Query(
description="Storage backend to use (e.g. 'minio', 's3'). "
"When omitted the backend is inferred from the resource."
),
] = None,
user=Depends(get_user),
):
"""Return a short-lived signed URL for a transcript or recording file stored on S3.
"""Return a short-lived signed URL for a file stored on S3 / MinIO.
Access Control:
* Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
authorized by matching the org_id against the requesting user's
organization.
* Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
are authorized via the workflow run they belong to.
* Superusers can request any key.
* Regular users can only request resources belonging to **their** workflow runs.
"""
# Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
if run_id is None:
raise HTTPException(status_code=400, detail="Invalid key format")
# ------------------------------------------------------------------
# 1. Authorize
# ------------------------------------------------------------------
workflow_run = None
# Authorize and get workflow run
workflow_run = await _authorize_and_get_workflow_run(run_id, user)
org_id = _extract_org_id_from_key(key)
if org_id is not None:
# Generic org-based auth
if not user.is_superuser and org_id != user.selected_organization_id:
raise HTTPException(status_code=403, detail="Access denied")
else:
# Legacy workflow-run-based auth
run_id = _extract_legacy_workflow_run_id(key)
if run_id is None:
raise HTTPException(status_code=400, detail="Invalid key format")
workflow_run = await _authorize_and_get_workflow_run(run_id, user)
# ------------------------------------------------------------------
# 3. Generate the signed URL using the correct storage backend
# 2. Resolve storage backend
# ------------------------------------------------------------------
try:
# Use the storage backend recorded when the file was uploaded
if (
if storage_backend:
storage = get_storage_for_backend(storage_backend)
elif (
workflow_run
and hasattr(workflow_run, "storage_backend")
and workflow_run.storage_backend
):
backend = workflow_run.storage_backend
storage = get_storage_for_backend(backend)
logger.info(
f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
)
storage = get_storage_for_backend(workflow_run.storage_backend)
else:
# Fallback to current storage for legacy records without storage_backend
storage = storage_fs
current_backend = StorageBackend.get_current_backend()
logger.warning(
f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
)
# ------------------------------------------------------------------
# 3. Generate the signed URL
# ------------------------------------------------------------------
url = await storage.aget_signed_url(
key, expiration=expires_in, force_inline=inline
)
if not url:
raise HTTPException(status_code=500, detail="Failed to generate signed URL")
# Log successful URL generation
backend_info = (
f"stored {backend}"
if workflow_run
and hasattr(workflow_run, "storage_backend")
and workflow_run.storage_backend
else f"current {StorageBackend.get_current_backend().name}"
)
logger.info(
f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
)
logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
return {"url": url, "expires_in": expires_in}
except ClientError as exc:
logger.error(f"Error generating signed URL: {exc}")

View file

@ -54,6 +54,7 @@
"pages": [
"voice-agent/introduction",
"voice-agent/editing-a-workflow",
"voice-agent/custom-recordings",
"voice-agent/template-variables",
{
"group": "Tools",

View file

@ -0,0 +1,79 @@
---
title: "Custom Recordings"
description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
---
Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated speech (via a cloned voice) for dynamic responses. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
<iframe
width="560"
height="315"
src="https://www.youtube.com/embed/1uZqhG0_cIo"
title="YouTube video player"
frameBorder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerPolicy="strict-origin-when-cross-origin"
allowFullScreen>
</iframe>
## Why use custom recordings?
- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
## Prerequisites
- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
## Step 1: Clone your voice
Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
1. Go to Cartesia and navigate to **Instant Clone**.
2. Record a short audio clip (up to 10 seconds) of your voice.
3. Give the clone a name and select your language.
4. Copy the **Voice ID** — you will need it in the next step.
<Note>
You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
</Note>
## Step 2: Configure the cloned voice in Dograh
1. Go to your agent's **Model Configuration** in the Dograh dashboard.
2. Under voice settings, select **Add Voice ID manually**.
3. Paste the Voice ID from your cloned voice.
4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
5. Enter the provider's API key if you haven't already.
6. Save the configuration.
## Step 3: Upload recordings
Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
For each recording:
1. Click **Record** (or upload a file).
2. Speak the exact phrase you want the agent to use.
3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
4. Verify the transcription is correct — edit it if needed.
5. Click **Upload**.
<Warning>
Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
</Warning>
## Step 4: Build the workflow
Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current provider and Voice ID.
For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
## Tips for best results
- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
- **Use professional-grade voice cloning** (when your TTS provider offers it) and provide more sample audio for a higher-quality voice clone.
- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.

View file

@ -1,9 +1,10 @@
import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
import { useCallback, useEffect, useRef, useState } from "react";
import {
createRecordingApiV1WorkflowRecordingsPost,
deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
getSignedUrlApiV1S3SignedUrlGet,
getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
listRecordingsApiV1WorkflowRecordingsGet,
transcribeAudioApiV1WorkflowRecordingsTranscribePost,
@ -58,6 +59,8 @@ export const RecordingsDialog = ({
const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
const [recordingFilename, setRecordingFilename] = useState("");
const [recordingDuration, setRecordingDuration] = useState(0);
const [playingId, setPlayingId] = useState<string | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const audioChunksRef = useRef<Blob[]>([]);
const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
@ -110,6 +113,14 @@ export const RecordingsDialog = ({
setRecordingDuration(0);
}, []);
// Pauses the audio element currently held in audioRef (if any), drops the
// reference to it, and clears the playing-recording marker.
const stopPlayback = useCallback(() => {
    const activeAudio = audioRef.current;
    if (activeAudio !== null) {
        activeAudio.pause();
        audioRef.current = null;
    }
    setPlayingId(null);
}, []);
useEffect(() => {
if (open) {
fetchRecordings();
@ -125,8 +136,9 @@ export const RecordingsDialog = ({
if (!open) {
stopRecording();
stopRecordingTimer();
stopPlayback();
}
}, [open, stopRecording, stopRecordingTimer]);
}, [open, stopRecording, stopRecordingTimer, stopPlayback]);
const transcribeFile = async (file: File) => {
setRecordingStep("transcribing");
@ -295,6 +307,33 @@ export const RecordingsDialog = ({
}
};
/**
 * Toggle preview playback for a saved recording.
 *
 * Clicking the recording that is already active stops it. Otherwise any
 * current playback is stopped, a signed URL is requested for the recording's
 * storage key, and the audio is played.
 *
 * The audio element is created and registered in `audioRef` *before* the
 * signed-URL request is awaited, so it doubles as a token for this request:
 * if `stopPlayback()` runs or another recording is selected while the request
 * is in flight, `audioRef.current` no longer points at this element and the
 * stale request is abandoned instead of starting a second, unstoppable audio.
 */
const handlePlay = async (rec: RecordingResponseSchema) => {
    const wasActive = playingId === rec.recording_id;
    stopPlayback();
    if (wasActive) {
        return;
    }

    const audio = new Audio();
    audioRef.current = audio;
    // Mark the row as active immediately so a second click cancels the
    // pending request rather than issuing a duplicate one.
    setPlayingId(rec.recording_id);

    const isCurrent = () => audioRef.current === audio;

    audio.onended = () => {
        // Only reset state if this element still owns playback.
        if (isCurrent()) {
            stopPlayback();
        }
    };

    try {
        const result = await getSignedUrlApiV1S3SignedUrlGet({
            query: {
                key: rec.storage_key,
                storage_backend: rec.storage_backend,
            },
        });
        if (!isCurrent()) {
            // Superseded or cancelled while the URL was being fetched.
            return;
        }
        const url = result.data?.url;
        if (!url) {
            stopPlayback();
            setError("Failed to get audio URL");
            return;
        }
        audio.src = url;
        await audio.play();
    } catch {
        // play() also rejects when pause() interrupts it (user pressed stop
        // or switched recordings); that is not an error worth surfacing.
        if (isCurrent()) {
            // Clear the active marker so the row does not stay stuck on the
            // pause icon after a failed start.
            stopPlayback();
            setError("Failed to play recording");
        }
    }
};
const isRecording = recordingStep === "recording";
const isTranscribing = recordingStep === "transcribing";
const isBusy = uploading || isRecording || isTranscribing;
@ -540,6 +579,17 @@ export const RecordingsDialog = ({
{rec.transcript}
</p>
</div>
<Button
size="sm"
variant="ghost"
onClick={() => handlePlay(rec)}
>
{playingId === rec.recording_id ? (
<Pause className="w-4 h-4" />
) : (
<Play className="w-4 h-4" />
)}
</Button>
<Button
size="sm"
variant="ghost"

View file

@ -1064,11 +1064,15 @@ export const getCampaignDefaultsApiV1OrganizationsCampaignDefaultsGet = <ThrowOn
/**
* Generate a signed S3 URL
* Return a short-lived signed URL for a transcript or recording file stored on S3.
* Return a short-lived signed URL for a file stored on S3 / MinIO.
*
* Access Control:
* * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
* authorized by matching the org_id against the requesting user's
* organization.
* * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
* are authorized via the workflow run they belong to.
* * Superusers can request any key.
* * Regular users can only request resources belonging to **their** workflow runs.
*/
export const getSignedUrlApiV1S3SignedUrlGet = <ThrowOnError extends boolean = false>(options: Options<GetSignedUrlApiV1S3SignedUrlGetData, ThrowOnError>) => {
return (options.client ?? _heyApiClient).get<GetSignedUrlApiV1S3SignedUrlGetResponse, GetSignedUrlApiV1S3SignedUrlGetError, ThrowOnError>({

View file

@ -3964,6 +3964,10 @@ export type GetSignedUrlApiV1S3SignedUrlGetData = {
key: string;
expires_in?: number;
inline?: boolean;
/**
* Storage backend to use (e.g. 'minio', 's3'). When omitted the backend is inferred from the resource.
*/
storage_backend?: string | null;
};
url: '/api/v1/s3/signed-url';
};