mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
chore: add custom recordings documentation
This commit is contained in:
parent
2fa4191d9b
commit
dc800bdd63
6 changed files with 211 additions and 37 deletions
|
|
@ -40,6 +40,38 @@ class PresignedUploadUrlResponse(BaseModel):
|
|||
router = APIRouter(prefix="/s3", tags=["s3"])
|
||||
|
||||
|
||||
def _extract_org_id_from_key(key: str) -> Optional[int]:
|
||||
"""Try to extract an organization ID from a storage key.
|
||||
|
||||
Matches keys of the form ``{prefix}/{org_id}/...`` where *org_id* is a
|
||||
positive integer. Returns ``None`` when the pattern does not match.
|
||||
"""
|
||||
parts = key.split("/")
|
||||
if len(parts) >= 3 and parts[1].isdigit():
|
||||
return int(parts[1])
|
||||
return None
|
||||
|
||||
|
||||
def _extract_legacy_workflow_run_id(key: str) -> Optional[int]:
|
||||
"""Extract a workflow_run_id from legacy key formats.
|
||||
|
||||
Supports:
|
||||
- ``transcripts/{run_id}.txt``
|
||||
- ``recordings/{run_id}.wav``
|
||||
|
||||
Returns ``None`` when the key does not match a legacy pattern.
|
||||
"""
|
||||
if key.startswith("transcripts/") and key.endswith(".txt"):
|
||||
run_id_str = key[len("transcripts/") : -4]
|
||||
elif key.startswith("recordings/") and key.endswith(".wav"):
|
||||
run_id_str = key[len("recordings/") : -4]
|
||||
else:
|
||||
return None
|
||||
|
||||
return int(run_id_str) if run_id_str.isdigit() else None
|
||||
|
||||
|
||||
# Keep for backward compat with file-metadata endpoint
|
||||
async def _validate_and_extract_workflow_run_id(
|
||||
key: str, allow_special_paths: bool = False
|
||||
) -> Optional[int]:
|
||||
|
|
@ -118,64 +150,68 @@ async def get_signed_url(
|
|||
key: Annotated[str, Query(description="S3 object key")],
|
||||
expires_in: int = 3600,
|
||||
inline: bool = False,
|
||||
storage_backend: Annotated[
|
||||
Optional[str],
|
||||
Query(
|
||||
description="Storage backend to use (e.g. 'minio', 's3'). "
|
||||
"When omitted the backend is inferred from the resource."
|
||||
),
|
||||
] = None,
|
||||
user=Depends(get_user),
|
||||
):
|
||||
"""Return a short-lived signed URL for a transcript or recording file stored on S3.
|
||||
"""Return a short-lived signed URL for a file stored on S3 / MinIO.
|
||||
|
||||
Access Control:
|
||||
* Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
|
||||
authorized by matching the org_id against the requesting user's
|
||||
organization.
|
||||
* Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
|
||||
are authorized via the workflow run they belong to.
|
||||
* Superusers can request any key.
|
||||
* Regular users can only request resources belonging to **their** workflow runs.
|
||||
"""
|
||||
|
||||
# Validate key and extract workflow_run_id (don't allow special paths for signed URLs)
|
||||
run_id = await _validate_and_extract_workflow_run_id(key, allow_special_paths=False)
|
||||
if run_id is None:
|
||||
raise HTTPException(status_code=400, detail="Invalid key format")
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Authorize
|
||||
# ------------------------------------------------------------------
|
||||
workflow_run = None
|
||||
|
||||
# Authorize and get workflow run
|
||||
workflow_run = await _authorize_and_get_workflow_run(run_id, user)
|
||||
org_id = _extract_org_id_from_key(key)
|
||||
if org_id is not None:
|
||||
# Generic org-based auth
|
||||
if not user.is_superuser and org_id != user.selected_organization_id:
|
||||
raise HTTPException(status_code=403, detail="Access denied")
|
||||
else:
|
||||
# Legacy workflow-run-based auth
|
||||
run_id = _extract_legacy_workflow_run_id(key)
|
||||
if run_id is None:
|
||||
raise HTTPException(status_code=400, detail="Invalid key format")
|
||||
workflow_run = await _authorize_and_get_workflow_run(run_id, user)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Generate the signed URL using the correct storage backend
|
||||
# 2. Resolve storage backend
|
||||
# ------------------------------------------------------------------
|
||||
try:
|
||||
# Use the storage backend recorded when the file was uploaded
|
||||
if (
|
||||
if storage_backend:
|
||||
storage = get_storage_for_backend(storage_backend)
|
||||
elif (
|
||||
workflow_run
|
||||
and hasattr(workflow_run, "storage_backend")
|
||||
and workflow_run.storage_backend
|
||||
):
|
||||
backend = workflow_run.storage_backend
|
||||
storage = get_storage_for_backend(backend)
|
||||
logger.info(
|
||||
f"DOWNLOAD: Using stored {backend} (value: {backend}) for signed URL generation - workflow_run_id: {run_id}, key: {key}"
|
||||
)
|
||||
storage = get_storage_for_backend(workflow_run.storage_backend)
|
||||
else:
|
||||
# Fallback to current storage for legacy records without storage_backend
|
||||
storage = storage_fs
|
||||
current_backend = StorageBackend.get_current_backend()
|
||||
logger.warning(
|
||||
f"DOWNLOAD: No storage_backend found for workflow run {run_id}, falling back to current {current_backend.name} - key: {key}"
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Generate the signed URL
|
||||
# ------------------------------------------------------------------
|
||||
url = await storage.aget_signed_url(
|
||||
key, expiration=expires_in, force_inline=inline
|
||||
)
|
||||
if not url:
|
||||
raise HTTPException(status_code=500, detail="Failed to generate signed URL")
|
||||
|
||||
# Log successful URL generation
|
||||
backend_info = (
|
||||
f"stored {backend}"
|
||||
if workflow_run
|
||||
and hasattr(workflow_run, "storage_backend")
|
||||
and workflow_run.storage_backend
|
||||
else f"current {StorageBackend.get_current_backend().name}"
|
||||
)
|
||||
logger.info(
|
||||
f"Successfully generated signed URL using {backend_info} - expires in {expires_in}s"
|
||||
)
|
||||
|
||||
logger.info(f"Generated signed URL for key={key}, expires_in={expires_in}s")
|
||||
return {"url": url, "expires_in": expires_in}
|
||||
except ClientError as exc:
|
||||
logger.error(f"Error generating signed URL: {exc}")
|
||||
|
|
|
|||
|
|
@ -54,6 +54,7 @@
|
|||
"pages": [
|
||||
"voice-agent/introduction",
|
||||
"voice-agent/editing-a-workflow",
|
||||
"voice-agent/custom-recordings",
|
||||
"voice-agent/template-variables",
|
||||
{
|
||||
"group": "Tools",
|
||||
|
|
|
|||
79
docs/voice-agent/custom-recordings.mdx
Normal file
79
docs/voice-agent/custom-recordings.mdx
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
---
|
||||
title: "Custom Recordings"
|
||||
description: "Build hybrid voice agents that combine pre-recorded audio with dynamic text generation for lower latency, reduced TTS costs, and natural-sounding conversations."
|
||||
---
|
||||
|
||||
Custom recordings allow you to build **hybrid voice agents** that use your own pre-recorded audio for key parts of the conversation, while falling back to LLM-generated speech (via a cloned voice) for dynamic responses. This gives you the best of both worlds — the emotional depth of real human speech and the flexibility of AI-generated dialogue.
|
||||
|
||||
<iframe
|
||||
width="560"
|
||||
height="315"
|
||||
src="https://www.youtube.com/embed/1uZqhG0_cIo"
|
||||
title="YouTube video player"
|
||||
frameBorder="0"
|
||||
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
|
||||
referrerPolicy="strict-origin-when-cross-origin"
|
||||
allowFullScreen>
|
||||
</iframe>
|
||||
|
||||
## Why use custom recordings?
|
||||
|
||||
- **Reduced TTS cost** — Pre-recorded audio is played directly, so you are not charged for TTS synthesis on those segments.
|
||||
- **Emotional variance** — Real recordings carry natural intonation and emotion that TTS cannot fully replicate.
|
||||
- **Lower latency** — Playing a pre-recorded clip is faster than synthesizing text at runtime.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A TTS provider that supports **voice cloning** (e.g., Cartesia, ElevenLabs, or Deepgram).
|
||||
- An API key for your chosen TTS provider, configured in [Voice settings](/configurations/voice).
|
||||
|
||||
## Step 1: Clone your voice
|
||||
|
||||
Clone your voice with your TTS provider so that dynamically generated speech sounds similar to your recordings. For example, with Cartesia:
|
||||
|
||||
1. Go to Cartesia and navigate to **Instant Clone**.
|
||||
2. Record a short audio clip (up to 10 seconds) of your voice.
|
||||
3. Give the clone a name and select your language.
|
||||
4. Copy the **Voice ID** — you will need it in the next step.
|
||||
|
||||
<Note>
|
||||
You can use any TTS provider that supports voice cloning. The steps will vary by provider, but the key output is always a **Voice ID** tied to your cloned voice.
|
||||
</Note>
|
||||
|
||||
## Step 2: Configure the cloned voice in Dograh
|
||||
|
||||
1. Go to your agent's **Model Configuration** in the Dograh dashboard.
|
||||
2. Under voice settings, select **Add Voice ID manually**.
|
||||
3. Paste the Voice ID from your cloned voice.
|
||||
4. Make sure the **provider** matches where you cloned your voice (e.g., Cartesia).
|
||||
5. Enter the provider's API key if you haven't already.
|
||||
6. Save the configuration.
|
||||
|
||||
## Step 3: Upload recordings
|
||||
|
||||
Navigate to your agent in the workflow builder and open the **Recordings** panel. You can either upload pre-recorded audio files or record directly in the browser.
|
||||
|
||||
For each recording:
|
||||
|
||||
1. Click **Record** (or upload a file).
|
||||
2. Speak the exact phrase you want the agent to use.
|
||||
3. Give the recording a descriptive name (e.g., `greeting`, `invitation`, `venue`).
|
||||
4. Verify the transcription is correct — edit it if needed.
|
||||
5. Click **Upload**.
|
||||
|
||||
<Warning>
|
||||
Recordings are scoped to a specific **provider and Voice ID**. If you change either, you will need to re-upload your recordings to ensure consistency between the recorded audio and the cloned voice used for dynamic responses.
|
||||
</Warning>
|
||||
|
||||
## Step 4: Build the workflow
|
||||
|
||||
Open your agent's workflow and write the conversation flow in natural language. To insert a recording, type **`@`** in the prompt editor — this will show a list of all available recordings scoped to your current Voice ID.
|
||||
|
||||
For any user question that falls outside your recordings, the agent automatically generates a dynamic response using the LLM, which is then synthesized using your cloned voice via TTS.
|
||||
|
||||
## Tips for best results
|
||||
|
||||
- **Record in a quiet environment** to improve audio quality and consistency with the cloned voice.
|
||||
- **Use professional-grade voice cloning** (when your TTS provider offers it) and provide more sample audio for a higher-quality voice clone.
|
||||
- **Keep recordings concise** — short, focused clips work best for specific conversation moments.
|
||||
- **Review call recordings** after testing to identify where the transition between pre-recorded and dynamic audio can be improved.
|
||||
|
|
@ -1,9 +1,10 @@
|
|||
import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
|
||||
import { Loader2, Mic, Pause, Play, Square, Trash2Icon, Upload } from "lucide-react";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
|
||||
import {
|
||||
createRecordingApiV1WorkflowRecordingsPost,
|
||||
deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
|
||||
getSignedUrlApiV1S3SignedUrlGet,
|
||||
getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
|
||||
listRecordingsApiV1WorkflowRecordingsGet,
|
||||
transcribeAudioApiV1WorkflowRecordingsTranscribePost,
|
||||
|
|
@ -58,6 +59,8 @@ export const RecordingsDialog = ({
|
|||
const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
|
||||
const [recordingFilename, setRecordingFilename] = useState("");
|
||||
const [recordingDuration, setRecordingDuration] = useState(0);
|
||||
const [playingId, setPlayingId] = useState<string | null>(null);
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
||||
const audioChunksRef = useRef<Blob[]>([]);
|
||||
const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||
|
|
@ -110,6 +113,14 @@ export const RecordingsDialog = ({
|
|||
setRecordingDuration(0);
|
||||
}, []);
|
||||
|
||||
/**
 * Halts whichever recording is currently playing (if any), releases the
 * audio element held in `audioRef`, and clears the "now playing" marker.
 */
const stopPlayback = useCallback(() => {
    const activeAudio = audioRef.current;
    if (activeAudio) {
        activeAudio.pause();
        audioRef.current = null;
    }
    setPlayingId(null);
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
if (open) {
|
||||
fetchRecordings();
|
||||
|
|
@ -125,8 +136,9 @@ export const RecordingsDialog = ({
|
|||
if (!open) {
|
||||
stopRecording();
|
||||
stopRecordingTimer();
|
||||
stopPlayback();
|
||||
}
|
||||
}, [open, stopRecording, stopRecordingTimer]);
|
||||
}, [open, stopRecording, stopRecordingTimer, stopPlayback]);
|
||||
|
||||
const transcribeFile = async (file: File) => {
|
||||
setRecordingStep("transcribing");
|
||||
|
|
@ -295,6 +307,33 @@ export const RecordingsDialog = ({
|
|||
}
|
||||
};
|
||||
|
||||
/**
 * Toggles playback for a recording row.
 *
 * Clicking the recording that is already playing stops it. Otherwise any
 * current playback is stopped, a signed URL is requested for the recording's
 * storage key, and the audio is played from that URL.
 *
 * @param rec - The recording whose play/pause button was clicked.
 */
const handlePlay = async (rec: RecordingResponseSchema) => {
    if (playingId === rec.recording_id) {
        stopPlayback();
        return;
    }
    stopPlayback();

    // Tracks the element created by this invocation so the error path can
    // tell whether it is still the active one.
    let audio: HTMLAudioElement | null = null;
    try {
        const result = await getSignedUrlApiV1S3SignedUrlGet({
            query: {
                key: rec.storage_key,
                storage_backend: rec.storage_backend,
            },
        });
        if (!result.data?.url) {
            setError("Failed to get audio URL");
            return;
        }
        const player = new Audio(result.data.url);
        audio = player;
        player.onended = () => {
            // Only reset the UI if this element is still the active one;
            // otherwise another recording has taken over playback.
            if (audioRef.current === player) {
                audioRef.current = null;
                setPlayingId(null);
            }
        };
        audioRef.current = player;
        setPlayingId(rec.recording_id);
        await player.play();
    } catch {
        if (audio !== null) {
            if (audioRef.current !== audio) {
                // Playback was stopped or superseded while it was starting
                // (play() rejects when interrupted by pause()); this is not
                // a failure the user needs to see.
                return;
            }
            // play() failed for the active element: clear the playing state
            // so the button does not stay stuck on "pause".
            stopPlayback();
        }
        setError("Failed to play recording");
    }
};
|
||||
|
||||
const isRecording = recordingStep === "recording";
|
||||
const isTranscribing = recordingStep === "transcribing";
|
||||
const isBusy = uploading || isRecording || isTranscribing;
|
||||
|
|
@ -540,6 +579,17 @@ export const RecordingsDialog = ({
|
|||
{rec.transcript}
|
||||
</p>
|
||||
</div>
|
||||
<Button
|
||||
size="sm"
|
||||
variant="ghost"
|
||||
onClick={() => handlePlay(rec)}
|
||||
>
|
||||
{playingId === rec.recording_id ? (
|
||||
<Pause className="w-4 h-4" />
|
||||
) : (
|
||||
<Play className="w-4 h-4" />
|
||||
)}
|
||||
</Button>
|
||||
<Button
|
||||
size="sm"
|
||||
variant="ghost"
|
||||
|
|
|
|||
|
|
@ -1064,11 +1064,15 @@ export const getCampaignDefaultsApiV1OrganizationsCampaignDefaultsGet = <ThrowOn
|
|||
|
||||
/**
|
||||
* Generate a signed S3 URL
|
||||
* Return a short-lived signed URL for a transcript or recording file stored on S3.
|
||||
* Return a short-lived signed URL for a file stored on S3 / MinIO.
|
||||
*
|
||||
* Access Control:
|
||||
* * Keys that embed an organization ID (``{prefix}/{org_id}/...``) are
|
||||
* authorized by matching the org_id against the requesting user's
|
||||
* organization.
|
||||
* * Legacy keys (``recordings/{run_id}.wav``, ``transcripts/{run_id}.txt``)
|
||||
* are authorized via the workflow run they belong to.
|
||||
* * Superusers can request any key.
|
||||
* * Regular users can only request resources belonging to **their** workflow runs.
|
||||
*/
|
||||
export const getSignedUrlApiV1S3SignedUrlGet = <ThrowOnError extends boolean = false>(options: Options<GetSignedUrlApiV1S3SignedUrlGetData, ThrowOnError>) => {
|
||||
return (options.client ?? _heyApiClient).get<GetSignedUrlApiV1S3SignedUrlGetResponse, GetSignedUrlApiV1S3SignedUrlGetError, ThrowOnError>({
|
||||
|
|
|
|||
|
|
@ -3964,6 +3964,10 @@ export type GetSignedUrlApiV1S3SignedUrlGetData = {
|
|||
key: string;
|
||||
expires_in?: number;
|
||||
inline?: boolean;
|
||||
/**
|
||||
* Storage backend to use (e.g. 'minio', 's3'). When omitted the backend is inferred from the resource.
|
||||
*/
|
||||
storage_backend?: string | null;
|
||||
};
|
||||
url: '/api/v1/s3/signed-url';
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue