feat: allow uploading recording as part of node transition

2026-06-10 08:05:22 +02:00 · 2026-04-10 11:54:00 +05:30 · 2026-04-10 11:54:00 +05:30 · 65c76ca7ff
commit 65c76ca7ff
parent bb5f56bfb7
36 changed files with 2255 additions and 201 deletions
--- a/ui/src/components/flow/TextOrAudioInput.tsx
+++ b/ui/src/components/flow/TextOrAudioInput.tsx
@ -0,0 +1,97 @@
+import { useId } from "react";
+
+import type { RecordingResponseSchema } from "@/client/types.gen";
+import { Label } from "@/components/ui/label";
+import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group";
+import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
+
+/** Props accepted by the TextOrAudioInput component. */
+interface TextOrAudioInputProps {
+    /** Currently selected mode; used as the radio group's value. */
+    type: 'text' | 'audio';
+    /** Called with the newly selected mode when the radio selection changes. */
+    onTypeChange: (type: 'text' | 'audio') => void;
+    /** Value handed to the recording dropdown that is shown in audio mode. */
+    recordingId: string;
+    /** Change handler handed to the recording dropdown. */
+    onRecordingIdChange: (id: string) => void;
+    /** Recordings offered in the dropdown; the component falls back to an empty list when omitted. */
+    recordings?: RecordingResponseSchema[];
+    /** Rendered when type === 'text' */
+    children: React.ReactNode;
+}
+
+/**
+ * Radio toggle between a caller-supplied text input and a recording dropdown.
+ *
+ * In text mode the component renders `children` untouched; in audio mode it
+ * renders a `RecordingSelect` bound to `recordingId` / `onRecordingIdChange`.
+ * The component is fully controlled: it keeps no selection state of its own.
+ *
+ * @param type - Currently selected mode.
+ * @param onTypeChange - Called with the new mode when the radio selection changes.
+ * @param recordingId - Selected recording id shown in audio mode.
+ * @param onRecordingIdChange - Called with the chosen recording id.
+ * @param recordings - Recordings to list; defaults to an empty list.
+ * @param children - Content rendered in text mode.
+ */
+export function TextOrAudioInput({
+    type,
+    onTypeChange,
+    recordingId,
+    onRecordingIdChange,
+    recordings = [],
+    children,
+}: TextOrAudioInputProps) {
+    // Ids must be unique per rendered instance: with fixed ids, a second
+    // TextOrAudioInput in the same document would have its labels resolve
+    // (via htmlFor) to the first instance's radio items.
+    const baseId = useId();
+    const textOptionId = `${baseId}-text`;
+    const audioOptionId = `${baseId}-audio`;
+
+    return (
+        <>
+            <RadioGroup
+                value={type}
+                onValueChange={(value) => onTypeChange(value as 'text' | 'audio')}
+                className="flex items-center gap-4"
+            >
+                <div className="flex items-center gap-2">
+                    <RadioGroupItem value="text" id={textOptionId} />
+                    <Label htmlFor={textOptionId} className="font-normal cursor-pointer">Text</Label>
+                </div>
+                <div className="flex items-center gap-2">
+                    <RadioGroupItem value="audio" id={audioOptionId} />
+                    <Label htmlFor={audioOptionId} className="font-normal cursor-pointer">Audio</Label>
+                </div>
+            </RadioGroup>
+            {type === 'text' ? (
+                children
+            ) : (
+                <RecordingSelect
+                    value={recordingId}
+                    onChange={onRecordingIdChange}
+                    recordings={recordings}
+                />
+            )}
+        </>
+    );
+}
+
+/** Props accepted by the RecordingSelect dropdown. */
+interface RecordingSelectProps {
+    /** Currently selected value, passed straight to the underlying Select. */
+    value: string;
+    /** Passed to the Select as its value-change handler. */
+    onChange: (id: string) => void;
+    /** Recordings to list; an empty array renders a single disabled placeholder item. */
+    recordings: RecordingResponseSchema[];
+}
+
+/**
+ * Dropdown to select a pre-recorded audio file.
+ * Exported so callers that only need the dropdown (e.g. tool configs with
+ * their own none/custom/audio radio) can use it directly.
+ *
+ * Each option is labelled with the recording's `metadata.original_filename`
+ * when that is a non-empty string, otherwise with its `recording_id`; the
+ * transcript, when present, is appended as muted secondary text. With no
+ * recordings, a single disabled "No recordings available" item is shown.
+ *
+ * @param value - Currently selected recording id.
+ * @param onChange - Called with the recording id the user picks.
+ * @param recordings - Recordings to list.
+ */
+export function RecordingSelect({ value, onChange, recordings }: RecordingSelectProps) {
+    return (
+        <div className="space-y-2">
+            <Label className="text-xs text-muted-foreground">
+                Select a pre-recorded audio file to play.
+            </Label>
+            <Select value={value} onValueChange={onChange}>
+                <SelectTrigger className="w-full">
+                    <SelectValue placeholder="Select a recording" />
+                </SelectTrigger>
+                <SelectContent>
+                    {recordings.length === 0 ? (
+                        <SelectItem value="__empty__" disabled>
+                            No recordings available
+                        </SelectItem>
+                    ) : (
+                        recordings.map((r) => {
+                            // Narrow at runtime instead of asserting: the field is not
+                            // statically typed as a string (it previously needed a cast),
+                            // so anything that is not a non-empty string falls back to the id.
+                            const filename = r.metadata?.original_filename;
+                            const displayName =
+                                typeof filename === 'string' && filename !== '' ? filename : r.recording_id;
+
+                            return (
+                                <SelectItem key={r.recording_id} value={r.recording_id}>
+                                    <span className="truncate">
+                                        {displayName}
+                                    </span>
+                                    {r.transcript && (
+                                        <span className="text-xs text-muted-foreground ml-2 truncate">
+                                            — {r.transcript}
+                                        </span>
+                                    )}
+                                </SelectItem>
+                            );
+                        })
+                    )}
+                </SelectContent>
+            </Select>
+        </div>
+    );
+}
--- a/ui/src/components/flow/edges/CustomEdge.tsx
+++ b/ui/src/components/flow/edges/CustomEdge.tsx
@ -4,6 +4,7 @@ import { useCallback, useEffect, useState } from 'react';

 import { useWorkflow, useWorkflowOptional } from "@/app/workflow/[workflowId]/contexts/WorkflowContext";
 import { useWorkflowStore } from "@/app/workflow/[workflowId]/stores/workflowStore";
+import { TextOrAudioInput } from "@/components/flow/TextOrAudioInput";
 import { Button } from "@/components/ui/button";
 import { Dialog, DialogContent, DialogFooter, DialogHeader, DialogTitle } from "@/components/ui/dialog";
 import { Input } from "@/components/ui/input";
@ -24,9 +25,12 @@ interface EdgeDetailsDialogProps {

 const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDialogProps) => {
    const readOnly = useWorkflowOptional()?.readOnly ?? false;
+    const { recordings } = useWorkflow();
    const [condition, setCondition] = useState(data?.condition ?? '');
    const [label, setLabel] = useState(data?.label ?? '');
    const [transitionSpeech, setTransitionSpeech] = useState(data?.transition_speech ?? '');
+    const [transitionSpeechType, setTransitionSpeechType] = useState<'text' | 'audio'>(data?.transition_speech_type ?? 'text');
+    const [transitionSpeechRecordingId, setTransitionSpeechRecordingId] = useState(data?.transition_speech_recording_id ?? '');

    // Update form state when data changes (e.g., from undo/redo)
    useEffect(() => {
@ -34,13 +38,21 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
            setCondition(data?.condition ?? '');
            setLabel(data?.label ?? '');
            setTransitionSpeech(data?.transition_speech ?? '');
+            setTransitionSpeechType(data?.transition_speech_type ?? 'text');
+            setTransitionSpeechRecordingId(data?.transition_speech_recording_id ?? '');
        }
    }, [data, open]);

    const handleSave = useCallback(() => {
-        onSave({ condition: condition, label: label, transition_speech: transitionSpeech || undefined });
+        onSave({
+            condition,
+            label,
+            transition_speech: transitionSpeechType === 'text' ? (transitionSpeech || undefined) : undefined,
+            transition_speech_type: transitionSpeechType,
+            transition_speech_recording_id: transitionSpeechType === 'audio' ? (transitionSpeechRecordingId || undefined) : undefined,
+        });
        onOpenChange(false);
-    }, [condition, label, transitionSpeech, onSave, onOpenChange]);
+    }, [condition, label, transitionSpeech, transitionSpeechType, transitionSpeechRecordingId, onSave, onOpenChange]);

    // Handle Cmd+S / Ctrl+S keyboard shortcut to save
    useEffect(() => {
@ -99,18 +111,28 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
                    <div className="grid gap-2">
                        <Label>Transition Speech</Label>
                        <Label className="text-xs text-muted-foreground">
-                            Optional text the assistant will speak right before transitioning to the node.
-                            This text will not be attached in Conversation Context. Use this as simple filler to reduce latency.
+                            Optional text or audio the assistant will play right before transitioning to the node.
+                            This will not be attached in Conversation Context. Use this as simple filler to reduce latency.
                        </Label>
-                        <div className="flex items-start gap-2 rounded-md bg-amber-50 p-2 text-xs text-amber-700 border border-amber-200">
-                            <AlertCircle className="h-3.5 w-3.5 mt-0.5 shrink-0" />
-                            <span>This text is spoken as-is. For multilingual workflows, choose your phrasing carefully.</span>
-                        </div>
-                        <Textarea
-                            value={transitionSpeech}
-                            placeholder="e.g. Let me transfer you to our billing department..."
-                            onChange={(e) => setTransitionSpeech(e.target.value)}
-                        />
+                        <TextOrAudioInput
+                            type={transitionSpeechType}
+                            onTypeChange={setTransitionSpeechType}
+                            recordingId={transitionSpeechRecordingId}
+                            onRecordingIdChange={setTransitionSpeechRecordingId}
+                            recordings={recordings ?? []}
+                        >
+                            <>
+                                <div className="flex items-start gap-2 rounded-md bg-amber-50 p-2 text-xs text-amber-700 border border-amber-200">
+                                    <AlertCircle className="h-3.5 w-3.5 mt-0.5 shrink-0" />
+                                    <span>This text is spoken as-is. For multilingual workflows, choose your phrasing carefully.</span>
+                                </div>
+                                <Textarea
+                                    value={transitionSpeech}
+                                    placeholder="e.g. Let me transfer you to our billing department..."
+                                    onChange={(e) => setTransitionSpeech(e.target.value)}
+                                />
+                            </>
+                        </TextOrAudioInput>
                    </div>
                </div>
                <DialogFooter>
--- a/ui/src/components/flow/nodes/StartCall.tsx
+++ b/ui/src/components/flow/nodes/StartCall.tsx
@ -8,6 +8,7 @@ import type { RecordingResponseSchema } from "@/client/types.gen";
 import { DocumentBadges } from "@/components/flow/DocumentBadges";
 import { DocumentSelector } from "@/components/flow/DocumentSelector";
 import { MentionTextarea } from "@/components/flow/MentionTextarea";
+import { TextOrAudioInput } from "@/components/flow/TextOrAudioInput";
 import { ToolBadges } from "@/components/flow/ToolBadges";
 import { ToolSelector } from "@/components/flow/ToolSelector";
 import { ExtractionVariable, FlowNodeData } from "@/components/flow/types";
@ -26,8 +27,12 @@ import { useNodeHandlers } from "./common/useNodeHandlers";

 interface StartCallEditFormProps {
    nodeData: FlowNodeData;
+    greetingType: 'text' | 'audio';
+    setGreetingType: (value: 'text' | 'audio') => void;
    greeting: string;
    setGreeting: (value: string) => void;
+    greetingRecordingId: string;
+    setGreetingRecordingId: (value: string) => void;
    prompt: string;
    setPrompt: (value: string) => void;
    name: string;
@ -73,7 +78,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
    const { saveWorkflow, tools, documents, recordings } = useWorkflow();

    // Form state
+    const [greetingType, setGreetingType] = useState<'text' | 'audio'>(data.greeting_type ?? "text");
    const [greeting, setGreeting] = useState(data.greeting ?? "");
+    const [greetingRecordingId, setGreetingRecordingId] = useState(data.greeting_recording_id ?? "");
    const [prompt, setPrompt] = useState(data.prompt ?? "");
    const [name, setName] = useState(data.name);
    const [allowInterrupt, setAllowInterrupt] = useState(data.allow_interrupt ?? true);
@ -109,7 +116,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {

        handleSaveNodeData({
            ...data,
-            greeting: greeting || undefined,
+            greeting_type: greetingType,
+            greeting: greetingType === 'text' ? (greeting || undefined) : undefined,
+            greeting_recording_id: greetingType === 'audio' ? (greetingRecordingId || undefined) : undefined,
            prompt,
            name,
            allow_interrupt: allowInterrupt,
@ -132,7 +141,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
    // Reset form state when dialog opens
    const handleOpenChange = (newOpen: boolean) => {
        if (newOpen) {
+            setGreetingType(data.greeting_type ?? "text");
            setGreeting(data.greeting ?? "");
+            setGreetingRecordingId(data.greeting_recording_id ?? "");
            setPrompt(data.prompt ?? "");
            setName(data.name);
            setAllowInterrupt(data.allow_interrupt ?? true);
@ -154,7 +165,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
    // Update form state when data changes (e.g., from undo/redo)
    useEffect(() => {
        if (open) {
+            setGreetingType(data.greeting_type ?? "text");
            setGreeting(data.greeting ?? "");
+            setGreetingRecordingId(data.greeting_recording_id ?? "");
            setPrompt(data.prompt ?? "");
            setName(data.name);
            setAllowInterrupt(data.allow_interrupt ?? true);
@ -247,8 +260,12 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
                {open && (
                    <StartCallEditForm
                        nodeData={data}
+                        greetingType={greetingType}
+                        setGreetingType={setGreetingType}
                        greeting={greeting}
                        setGreeting={setGreeting}
+                        greetingRecordingId={greetingRecordingId}
+                        setGreetingRecordingId={setGreetingRecordingId}
                        prompt={prompt}
                        setPrompt={setPrompt}
                        name={name}
@ -288,8 +305,12 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
 });

 const StartCallEditForm = ({
+    greetingType,
+    setGreetingType,
    greeting,
    setGreeting,
+    greetingRecordingId,
+    setGreetingRecordingId,
    prompt,
    setPrompt,
    name,
@ -362,15 +383,22 @@ const StartCallEditForm = ({

            <Label>Greeting</Label>
            <Label className="text-xs text-muted-foreground">
-                Optional greeting message played via TTS when the call starts. If set, this will be spoken directly instead of generating a response from the LLM. Supports template variables like {"{{variable_name}}"}.
+                Optional greeting played when the call starts. Choose between a text message (spoken via TTS) or a pre-recorded audio file.
            </Label>
-            <MentionTextarea
-                value={greeting}
-                onChange={setGreeting}
-                className="min-h-[60px] max-h-[200px] resize-none overflow-y-auto"
-                placeholder="e.g. Hello {{first_name}}, this is Sarah calling from Acme Corp."
+            <TextOrAudioInput
+                type={greetingType}
+                onTypeChange={setGreetingType}
+                recordingId={greetingRecordingId}
+                onRecordingIdChange={setGreetingRecordingId}
                recordings={recordings}
-            />
+            >
+                <Textarea
+                    value={greeting}
+                    onChange={(e) => setGreeting(e.target.value)}
+                    className="min-h-[60px] max-h-[200px] resize-none overflow-y-auto"
+                    placeholder="e.g. Hello {{first_name}}, this is Sarah calling from Acme Corp."
+                />
+            </TextOrAudioInput>

            <Label>Prompt</Label>
            <Label className="text-xs text-muted-foreground">
--- a/ui/src/components/flow/types.ts
+++ b/ui/src/components/flow/types.ts
@ -24,6 +24,8 @@ export type FlowNodeData = {
    extraction_variables?: ExtractionVariable[];
    add_global_prompt?: boolean;
    greeting?: string;
+    greeting_type?: 'text' | 'audio';
+    greeting_recording_id?: string;
    wait_for_user_greeting?: boolean;
    detect_voicemail?: boolean;
    delayed_start?: boolean;
@ -79,6 +81,8 @@ export type FlowEdgeData = {
    condition: string;
    label: string;
    transition_speech?: string;
+    transition_speech_type?: 'text' | 'audio';
+    transition_speech_recording_id?: string;
    invalid?: boolean;
    validationMessage?: string | null;
 }
--- a/ui/src/components/layout/AppSidebar.tsx
+++ b/ui/src/components/layout/AppSidebar.tsx
@ -2,6 +2,7 @@

 import type { Team } from "@stackframe/stack";
 import {
+  AudioLines,
  Brain,
  ChevronLeft,
  ChevronRight,
@ -135,6 +136,11 @@ export function AppSidebar() {
          url: "/files",
          icon: Database,
        },
+        {
+          title: "Recordings",
+          url: "/recordings",
+          icon: AudioLines,
+        },
        // {
        //   title: "Integrations",
        //   url: "/integrations",