feat: add recording audio option in tool and node transitions (#232)

* feat: allow uploading recording as part of node transition

* feat: allow recordings in tool transitions

* chore: fix tests
This commit is contained in:
Abhishek 2026-04-10 17:53:42 +05:30 committed by GitHub
parent 3f19a16e7f
commit 7c245051d2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
54 changed files with 3575 additions and 640 deletions

View file

@ -0,0 +1,212 @@
import { Check, ChevronDown, Pause, Play, Search } from "lucide-react";
import { useId, useMemo, useState } from "react";
import type { RecordingResponseSchema } from "@/client/types.gen";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Popover, PopoverContentInline, PopoverTrigger } from "@/components/ui/popover";
import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group";
import { useAudioPlayback } from "@/hooks/useAudioPlayback";
import { cn } from "@/lib/utils";
/** Props for TextOrAudioInput, a controlled text/audio toggle. */
interface TextOrAudioInputProps {
/** Currently selected mode; also the value of the radio group. */
type: 'text' | 'audio';
/** Called with the newly chosen mode when the radio selection changes. */
onTypeChange: (type: 'text' | 'audio') => void;
/** Id of the chosen recording, forwarded to RecordingSelect as its value. */
recordingId: string;
/** Forwarded to RecordingSelect as its onChange handler. */
onRecordingIdChange: (id: string) => void;
/** Recordings offered in audio mode; the component defaults this to an empty list. */
recordings?: RecordingResponseSchema[];
/** Rendered when type === 'text' */
children: React.ReactNode;
}
/**
 * Radio toggle between a free-form text input and a pre-recorded audio clip.
 *
 * In `'text'` mode the caller-supplied `children` are rendered; in `'audio'`
 * mode a `RecordingSelect` dropdown is rendered instead. The component is
 * fully controlled: it keeps no state of its own.
 *
 * @param type - Currently selected mode.
 * @param onTypeChange - Called with the new mode when the radio selection changes.
 * @param recordingId - Id of the selected recording, passed to `RecordingSelect`.
 * @param onRecordingIdChange - Passed to `RecordingSelect` as its change handler.
 * @param recordings - Recordings offered in audio mode (defaults to an empty list).
 * @param children - Content rendered in text mode.
 */
export function TextOrAudioInput({
  type,
  onTypeChange,
  recordingId,
  onRecordingIdChange,
  recordings = [],
  children,
}: TextOrAudioInputProps) {
  // Ids must be unique per instance: with fixed ids, two mounted inputs would
  // share the same radio ids and each label's htmlFor would hit the first match.
  const idPrefix = useId();
  const textId = `${idPrefix}-text`;
  const audioId = `${idPrefix}-audio`;

  const handleValueChange = (value: string) => {
    // Narrow at runtime instead of asserting, so unexpected values are ignored.
    if (value === 'text' || value === 'audio') {
      onTypeChange(value);
    }
  };

  return (
    <>
      <RadioGroup
        value={type}
        onValueChange={handleValueChange}
        className="flex items-center gap-4"
      >
        <div className="flex items-center gap-2">
          <RadioGroupItem value="text" id={textId} />
          <Label htmlFor={textId} className="font-normal cursor-pointer">Text</Label>
        </div>
        <div className="flex items-center gap-2">
          <RadioGroupItem value="audio" id={audioId} />
          <Label htmlFor={audioId} className="font-normal cursor-pointer">Audio</Label>
        </div>
      </RadioGroup>
      {type === 'text' ? (
        children
      ) : (
        <RecordingSelect
          value={recordingId}
          onChange={onRecordingIdChange}
          recordings={recordings}
        />
      )}
    </>
  );
}
/** Props for RecordingSelect, a controlled recording dropdown. */
interface RecordingSelectProps {
/** Selected recording id; compared against String(recording.id). */
value: string;
/** Called with String(recording.id) when a row is clicked. */
onChange: (id: string) => void;
/** Recordings listed in the dropdown; an empty list shows a placeholder message. */
recordings: RecordingResponseSchema[];
}
/** Longest transcript prefix shown on the trigger button before it is cut off. */
const MAX_TRIGGER_TRANSCRIPT_LENGTH = 75;

/**
 * Returns `metadata.original_filename` when it is a string, otherwise "".
 * Guards against non-string metadata values instead of asserting the type.
 */
function getOriginalFilename(rec: RecordingResponseSchema): string {
  const filename = rec.metadata?.original_filename;
  return typeof filename === "string" ? filename : "";
}

/**
 * Dropdown to select a pre-recorded audio file.
 * Re-exported so callers that only need the dropdown (e.g. tool configs with
 * their own none/custom/audio radio) can use it directly.
 *
 * The popover lists every recording with a search box (matching recording id,
 * transcript, or original filename, case-insensitively) and a per-row
 * play/pause button. Clicking a row stops playback, reports the recording's
 * `id` as a string through `onChange`, and closes the popover.
 *
 * @param value - Selected recording id, compared against `String(recording.id)`.
 * @param onChange - Called with `String(recording.id)` when a row is chosen.
 * @param recordings - Recordings to list.
 */
export function RecordingSelect({ value, onChange, recordings }: RecordingSelectProps) {
  const [open, setOpen] = useState(false);
  const [search, setSearch] = useState("");
  const { playingId, toggle, stop } = useAudioPlayback();

  const selected = recordings.find((r) => String(r.id) === value);

  const filtered = useMemo(() => {
    // Trim so stray leading/trailing whitespace does not hide every result.
    const q = search.trim().toLowerCase();
    if (!q) return recordings;
    return recordings.filter((r) =>
      r.recording_id.toLowerCase().includes(q) ||
      r.transcript.toLowerCase().includes(q) ||
      getOriginalFilename(r).toLowerCase().includes(q)
    );
  }, [recordings, search]);

  const handleSelect = (rec: RecordingResponseSchema) => {
    stop();
    onChange(String(rec.id));
    setOpen(false);
  };

  const handlePlay = async (e: React.MouseEvent, rec: RecordingResponseSchema) => {
    // Keep the click from bubbling to the row, which would select the recording.
    e.stopPropagation();
    try {
      await toggle(rec.recording_id, rec.storage_key, rec.storage_backend);
    } catch (error) {
      // Previewing is best-effort: a failure must not break selection,
      // but it should leave a trace for debugging.
      console.warn("Recording playback failed", error);
    }
  };

  const handleOpenChange = (nextOpen: boolean) => {
    if (!nextOpen) {
      // Closing the popover ends any preview and clears the search box.
      stop();
      setSearch("");
    }
    setOpen(nextOpen);
  };

  return (
    <div className="space-y-2">
      <Label className="text-xs text-muted-foreground">
        Select a pre-recorded audio file to play.
      </Label>
      <Popover modal open={open} onOpenChange={handleOpenChange}>
        <PopoverTrigger asChild>
          <Button
            variant="outline"
            role="combobox"
            aria-expanded={open}
            className="w-full justify-between h-auto min-h-9 font-normal"
          >
            {selected ? (
              <span className="flex items-center gap-2 text-left">
                <code className="text-xs bg-muted px-1 py-0.5 rounded font-mono shrink-0">
                  {selected.recording_id}
                </code>
                <span className="text-sm">
                  {selected.transcript.length > MAX_TRIGGER_TRANSCRIPT_LENGTH
                    ? `${selected.transcript.slice(0, MAX_TRIGGER_TRANSCRIPT_LENGTH)}…`
                    : selected.transcript}
                </span>
              </span>
            ) : (
              <span className="text-muted-foreground">Select a recording</span>
            )}
            <ChevronDown className="ml-2 h-4 w-4 shrink-0 opacity-50" />
          </Button>
        </PopoverTrigger>
        <PopoverContentInline
          className="w-[var(--radix-popover-trigger-width)] p-0"
          align="start"
        >
          {recordings.length === 0 ? (
            <div className="p-3 text-sm text-muted-foreground text-center">
              No recordings available
            </div>
          ) : (
            <div>
              <div className="p-2 border-b">
                <div className="relative">
                  <Search className="absolute left-2.5 top-1/2 -translate-y-1/2 h-3.5 w-3.5 text-muted-foreground" />
                  <Input
                    placeholder="Search by ID, transcript, or filename..."
                    value={search}
                    onChange={(e) => setSearch(e.target.value)}
                    className="h-8 pl-8 text-sm"
                    autoFocus
                  />
                </div>
              </div>
              <div className="max-h-56 overflow-y-auto">
                {filtered.length === 0 ? (
                  <div className="p-3 text-sm text-muted-foreground text-center">
                    No recordings match &ldquo;{search}&rdquo;
                  </div>
                ) : filtered.map((r) => {
                  const filename = getOriginalFilename(r);
                  const isSelected = String(r.id) === value;
                  const isPlaying = playingId === r.recording_id;
                  return (
                    <div
                      key={r.id}
                      className={cn(
                        "flex items-center gap-2 px-3 py-2 cursor-pointer hover:bg-accent transition-colors",
                        isSelected && "bg-accent"
                      )}
                      onClick={() => handleSelect(r)}
                    >
                      <Check className={cn(
                        "h-4 w-4 shrink-0",
                        isSelected ? "opacity-100" : "opacity-0"
                      )} />
                      <code className="text-xs bg-muted px-1 py-0.5 rounded font-mono shrink-0">
                        {r.recording_id}
                      </code>
                      {filename && (
                        <span className="text-xs text-muted-foreground shrink-0 max-w-[100px] truncate">
                          {filename}
                        </span>
                      )}
                      <span className="text-xs text-muted-foreground bg-muted/50 px-1.5 py-0.5 rounded truncate flex-1 min-w-0">
                        {r.transcript}
                      </span>
                      <Button
                        type="button"
                        variant="ghost"
                        size="sm"
                        className="h-7 w-7 p-0 shrink-0"
                        // Icon-only button: give assistive tech a name for it.
                        aria-label={isPlaying ? "Pause recording" : "Play recording"}
                        onClick={(e) => handlePlay(e, r)}
                      >
                        {isPlaying ? (
                          <Pause className="h-3.5 w-3.5" />
                        ) : (
                          <Play className="h-3.5 w-3.5" />
                        )}
                      </Button>
                    </div>
                  );
                })}
              </div>
            </div>
          )}
        </PopoverContentInline>
      </Popover>
    </div>
  );
}

View file

@ -4,6 +4,7 @@ import { useCallback, useEffect, useState } from 'react';
import { useWorkflow, useWorkflowOptional } from "@/app/workflow/[workflowId]/contexts/WorkflowContext";
import { useWorkflowStore } from "@/app/workflow/[workflowId]/stores/workflowStore";
import { TextOrAudioInput } from "@/components/flow/TextOrAudioInput";
import { Button } from "@/components/ui/button";
import { Dialog, DialogContent, DialogFooter, DialogHeader, DialogTitle } from "@/components/ui/dialog";
import { Input } from "@/components/ui/input";
@ -24,9 +25,12 @@ interface EdgeDetailsDialogProps {
const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDialogProps) => {
const readOnly = useWorkflowOptional()?.readOnly ?? false;
const { recordings } = useWorkflow();
const [condition, setCondition] = useState(data?.condition ?? '');
const [label, setLabel] = useState(data?.label ?? '');
const [transitionSpeech, setTransitionSpeech] = useState(data?.transition_speech ?? '');
const [transitionSpeechType, setTransitionSpeechType] = useState<'text' | 'audio'>(data?.transition_speech_type ?? 'text');
const [transitionSpeechRecordingId, setTransitionSpeechRecordingId] = useState(data?.transition_speech_recording_id ?? '');
// Update form state when data changes (e.g., from undo/redo)
useEffect(() => {
@ -34,13 +38,21 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
setCondition(data?.condition ?? '');
setLabel(data?.label ?? '');
setTransitionSpeech(data?.transition_speech ?? '');
setTransitionSpeechType(data?.transition_speech_type ?? 'text');
setTransitionSpeechRecordingId(data?.transition_speech_recording_id ?? '');
}
}, [data, open]);
const handleSave = useCallback(() => {
onSave({ condition: condition, label: label, transition_speech: transitionSpeech || undefined });
onSave({
condition,
label,
transition_speech: transitionSpeechType === 'text' ? (transitionSpeech || undefined) : undefined,
transition_speech_type: transitionSpeechType,
transition_speech_recording_id: transitionSpeechType === 'audio' ? (transitionSpeechRecordingId || undefined) : undefined,
});
onOpenChange(false);
}, [condition, label, transitionSpeech, onSave, onOpenChange]);
}, [condition, label, transitionSpeech, transitionSpeechType, transitionSpeechRecordingId, onSave, onOpenChange]);
// Handle Cmd+S / Ctrl+S keyboard shortcut to save
useEffect(() => {
@ -60,7 +72,7 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent>
<DialogContent className="max-h-[85vh] flex flex-col">
<DialogHeader>
<DialogTitle>Edit Condition</DialogTitle>
{data?.invalid && data.validationMessage && (
@ -70,7 +82,7 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
</div>
)}
</DialogHeader>
<div className="grid gap-4 py-4">
<div className="grid gap-4 py-4 overflow-y-auto">
<div className="grid gap-2">
<Label>Condition Label</Label>
<Label className="text-xs text-muted-foreground">
@ -99,18 +111,28 @@ const EdgeDetailsDialog = ({ open, onOpenChange, data, onSave }: EdgeDetailsDial
<div className="grid gap-2">
<Label>Transition Speech</Label>
<Label className="text-xs text-muted-foreground">
Optional text the assistant will speak right before transitioning to the node.
This text will not be attached in Conversation Context. Use this as simple filler to reduce latency.
Optional text or audio the assistant will play right before transitioning to the node.
This will not be attached in Conversation Context. Use this as simple filler to reduce latency.
</Label>
<div className="flex items-start gap-2 rounded-md bg-amber-50 p-2 text-xs text-amber-700 border border-amber-200">
<AlertCircle className="h-3.5 w-3.5 mt-0.5 shrink-0" />
<span>This text is spoken as-is. For multilingual workflows, choose your phrasing carefully.</span>
</div>
<Textarea
value={transitionSpeech}
placeholder="e.g. Let me transfer you to our billing department..."
onChange={(e) => setTransitionSpeech(e.target.value)}
/>
<TextOrAudioInput
type={transitionSpeechType}
onTypeChange={setTransitionSpeechType}
recordingId={transitionSpeechRecordingId}
onRecordingIdChange={setTransitionSpeechRecordingId}
recordings={recordings ?? []}
>
<>
<div className="flex items-start gap-2 rounded-md bg-amber-50 p-2 text-xs text-amber-700 border border-amber-200">
<AlertCircle className="h-3.5 w-3.5 mt-0.5 shrink-0" />
<span>This text is spoken as-is. For multilingual workflows, choose your phrasing carefully.</span>
</div>
<Textarea
value={transitionSpeech}
placeholder="e.g. Let me transfer you to our billing department..."
onChange={(e) => setTransitionSpeech(e.target.value)}
/>
</>
</TextOrAudioInput>
</div>
</div>
<DialogFooter>

View file

@ -8,6 +8,7 @@ import type { RecordingResponseSchema } from "@/client/types.gen";
import { DocumentBadges } from "@/components/flow/DocumentBadges";
import { DocumentSelector } from "@/components/flow/DocumentSelector";
import { MentionTextarea } from "@/components/flow/MentionTextarea";
import { TextOrAudioInput } from "@/components/flow/TextOrAudioInput";
import { ToolBadges } from "@/components/flow/ToolBadges";
import { ToolSelector } from "@/components/flow/ToolSelector";
import { ExtractionVariable, FlowNodeData } from "@/components/flow/types";
@ -26,8 +27,12 @@ import { useNodeHandlers } from "./common/useNodeHandlers";
interface StartCallEditFormProps {
nodeData: FlowNodeData;
greetingType: 'text' | 'audio';
setGreetingType: (value: 'text' | 'audio') => void;
greeting: string;
setGreeting: (value: string) => void;
greetingRecordingId: string;
setGreetingRecordingId: (value: string) => void;
prompt: string;
setPrompt: (value: string) => void;
name: string;
@ -73,7 +78,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
const { saveWorkflow, tools, documents, recordings } = useWorkflow();
// Form state
const [greetingType, setGreetingType] = useState<'text' | 'audio'>(data.greeting_type ?? "text");
const [greeting, setGreeting] = useState(data.greeting ?? "");
const [greetingRecordingId, setGreetingRecordingId] = useState(data.greeting_recording_id ?? "");
const [prompt, setPrompt] = useState(data.prompt ?? "");
const [name, setName] = useState(data.name);
const [allowInterrupt, setAllowInterrupt] = useState(data.allow_interrupt ?? true);
@ -109,7 +116,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
handleSaveNodeData({
...data,
greeting: greeting || undefined,
greeting_type: greetingType,
greeting: greetingType === 'text' ? (greeting || undefined) : undefined,
greeting_recording_id: greetingType === 'audio' ? (greetingRecordingId || undefined) : undefined,
prompt,
name,
allow_interrupt: allowInterrupt,
@ -132,7 +141,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
// Reset form state when dialog opens
const handleOpenChange = (newOpen: boolean) => {
if (newOpen) {
setGreetingType(data.greeting_type ?? "text");
setGreeting(data.greeting ?? "");
setGreetingRecordingId(data.greeting_recording_id ?? "");
setPrompt(data.prompt ?? "");
setName(data.name);
setAllowInterrupt(data.allow_interrupt ?? true);
@ -154,7 +165,9 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
// Update form state when data changes (e.g., from undo/redo)
useEffect(() => {
if (open) {
setGreetingType(data.greeting_type ?? "text");
setGreeting(data.greeting ?? "");
setGreetingRecordingId(data.greeting_recording_id ?? "");
setPrompt(data.prompt ?? "");
setName(data.name);
setAllowInterrupt(data.allow_interrupt ?? true);
@ -247,8 +260,12 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
{open && (
<StartCallEditForm
nodeData={data}
greetingType={greetingType}
setGreetingType={setGreetingType}
greeting={greeting}
setGreeting={setGreeting}
greetingRecordingId={greetingRecordingId}
setGreetingRecordingId={setGreetingRecordingId}
prompt={prompt}
setPrompt={setPrompt}
name={name}
@ -288,8 +305,12 @@ export const StartCall = memo(({ data, selected, id }: StartCallNodeProps) => {
});
const StartCallEditForm = ({
greetingType,
setGreetingType,
greeting,
setGreeting,
greetingRecordingId,
setGreetingRecordingId,
prompt,
setPrompt,
name,
@ -362,15 +383,22 @@ const StartCallEditForm = ({
<Label>Greeting</Label>
<Label className="text-xs text-muted-foreground">
Optional greeting message played via TTS when the call starts. If set, this will be spoken directly instead of generating a response from the LLM. Supports template variables like {"{{variable_name}}"}.
Optional greeting played when the call starts. Choose between a text message (spoken via TTS) or a pre-recorded audio file.
</Label>
<MentionTextarea
value={greeting}
onChange={setGreeting}
className="min-h-[60px] max-h-[200px] resize-none overflow-y-auto"
placeholder="e.g. Hello {{first_name}}, this is Sarah calling from Acme Corp."
<TextOrAudioInput
type={greetingType}
onTypeChange={setGreetingType}
recordingId={greetingRecordingId}
onRecordingIdChange={setGreetingRecordingId}
recordings={recordings}
/>
>
<Textarea
value={greeting}
onChange={(e) => setGreeting(e.target.value)}
className="min-h-[60px] max-h-[200px] resize-none overflow-y-auto"
placeholder="e.g. Hello {{first_name}}, this is Sarah calling from Acme Corp."
/>
</TextOrAudioInput>
<Label>Prompt</Label>
<Label className="text-xs text-muted-foreground">

View file

@ -24,6 +24,8 @@ export type FlowNodeData = {
extraction_variables?: ExtractionVariable[];
add_global_prompt?: boolean;
greeting?: string;
greeting_type?: 'text' | 'audio';
greeting_recording_id?: string;
wait_for_user_greeting?: boolean;
detect_voicemail?: boolean;
delayed_start?: boolean;
@ -79,6 +81,8 @@ export type FlowEdgeData = {
condition: string;
label: string;
transition_speech?: string;
transition_speech_type?: 'text' | 'audio';
transition_speech_recording_id?: string;
invalid?: boolean;
validationMessage?: string | null;
}

View file

@ -2,6 +2,7 @@
import type { Team } from "@stackframe/stack";
import {
AudioLines,
Brain,
ChevronLeft,
ChevronRight,
@ -135,6 +136,11 @@ export function AppSidebar() {
url: "/files",
icon: Database,
},
{
title: "Recordings",
url: "/recordings",
icon: AudioLines,
},
// {
// title: "Integrations",
// url: "/integrations",

View file

@ -56,6 +56,23 @@ function DialogContent({
<DialogOverlay />
<DialogPrimitive.Content
onOpenAutoFocus={e => e.preventDefault()}
onCloseAutoFocus={() => {
document.body.style.pointerEvents = "";
}}
onPointerDownOutside={(e) => {
// Prevent the Dialog from closing when the user clicks inside a
// portaled Radix Popover/DropdownMenu rendered on top of this Dialog.
const target = e.target as HTMLElement;
if (target.closest('[data-radix-popper-content-wrapper]')) {
e.preventDefault();
}
}}
onInteractOutside={(e) => {
const target = e.target as HTMLElement;
if (target.closest('[data-radix-popper-content-wrapper]')) {
e.preventDefault();
}
}}
data-slot="dialog-content"
className={cn(
"bg-background data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-lg border p-6 shadow-lg duration-200 sm:max-w-lg",

View file

@ -17,6 +17,9 @@ function PopoverTrigger({
return <PopoverPrimitive.Trigger data-slot="popover-trigger" {...props} />
}
const popoverContentClass =
"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-72 origin-(--radix-popover-content-transform-origin) rounded-md border p-4 shadow-md outline-hidden"
function PopoverContent({
className,
align = "center",
@ -29,20 +32,38 @@ function PopoverContent({
data-slot="popover-content"
align={align}
sideOffset={sideOffset}
className={cn(
"bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-72 origin-(--radix-popover-content-transform-origin) rounded-md border p-4 shadow-md outline-hidden",
className
)}
className={cn(popoverContentClass, className)}
{...props}
/>
</PopoverPrimitive.Portal>
)
}
/**
 * Portal-free variant of PopoverContent: the content element is emitted
 * directly where it is used rather than inside PopoverPrimitive.Portal.
 * Meant for popovers hosted in a Dialog, to avoid focus-trap conflicts.
 */
function PopoverContentInline(
  props: React.ComponentProps<typeof PopoverPrimitive.Content>
) {
  const { className, align = "center", sideOffset = 4, ...rest } = props
  const mergedClassName = cn(popoverContentClass, className)

  return (
    <PopoverPrimitive.Content
      data-slot="popover-content"
      align={align}
      sideOffset={sideOffset}
      className={mergedClassName}
      {...rest}
    />
  )
}
function PopoverAnchor({
...props
}: React.ComponentProps<typeof PopoverPrimitive.Anchor>) {
return <PopoverPrimitive.Anchor data-slot="popover-anchor" {...props} />
}
export { Popover, PopoverAnchor,PopoverContent, PopoverTrigger }
export { Popover, PopoverAnchor, PopoverContent, PopoverContentInline, PopoverTrigger }