feat: allow recording audio in workflow builder

2026-06-16 08:25:18 +02:00 · 2026-03-25 15:01:39 +05:30 · 2026-03-25 15:01:39 +05:30 · 2fa4191d9b
commit 2fa4191d9b
parent ac0731a374
22 changed files with 700 additions and 246 deletions
--- a/ui/src/app/workflow/[workflowId]/RenderWorkflow.tsx
+++ b/ui/src/app/workflow/[workflowId]/RenderWorkflow.tsx
@ -14,6 +14,7 @@ import type { DocumentResponseSchema, RecordingResponseSchema, ToolResponse } fr
 import { FlowEdge, FlowNode, NodeType } from "@/components/flow/types";
 import { Button } from '@/components/ui/button';
 import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip';
+import { useUserConfig } from '@/context/UserConfigContext';
 import { WorkflowConfigurations } from '@/types/workflow-configurations';

 import AddNodePanel from "../../../components/flow/AddNodePanel";
@ -64,6 +65,11 @@ interface RenderWorkflowProps {
 }

 function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialTemplateContextVariables, initialWorkflowConfigurations, user }: RenderWorkflowProps) {
+    const { userConfig } = useUserConfig();
+    const ttsProvider = (userConfig?.tts?.provider as string) ?? "";
+    const ttsModel = (userConfig?.tts?.model as string) ?? "";
+    const ttsVoiceId = (userConfig?.tts?.voice as string) ?? "";
+
    const [isContextVarsDialogOpen, setIsContextVarsDialogOpen] = useState(false);
    const [isConfigurationsDialogOpen, setIsConfigurationsDialogOpen] = useState(false);
    const [isDictionaryDialogOpen, setIsDictionaryDialogOpen] = useState(false);
@ -125,10 +131,15 @@ function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialT
                    setTools(toolsResponse.data);
                }

-                // Fetch recordings for this workflow
+                // Fetch recordings for this workflow filtered by active TTS config
                try {
                    const recordingsResponse = await listRecordingsApiV1WorkflowRecordingsGet({
-                        query: { workflow_id: workflowId },
+                        query: {
+                            workflow_id: workflowId,
+                            tts_provider: ttsProvider || undefined,
+                            tts_model: ttsModel || undefined,
+                            tts_voice_id: ttsVoiceId || undefined,
+                        },
                    });
                    if (recordingsResponse.data) {
                        setRecordings(recordingsResponse.data.recordings);
@ -142,7 +153,7 @@ function RenderWorkflow({ initialWorkflowName, workflowId, initialFlow, initialT
        };

        fetchData();
-    }, [workflowId]);
+    }, [workflowId, ttsProvider, ttsModel, ttsVoiceId]);

    // Memoize defaultEdgeOptions to prevent unnecessary re-renders
    const defaultEdgeOptions = useMemo(() => ({
--- a/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/RecordingsDialog.tsx
@ -1,4 +1,4 @@
-import { Loader2, Trash2Icon, Upload } from "lucide-react";
+import { Loader2, Mic, Square, Trash2Icon, Upload } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";

 import {
@ -6,6 +6,7 @@ import {
    deleteRecordingApiV1WorkflowRecordingsRecordingIdDelete,
    getUploadUrlApiV1WorkflowRecordingsUploadUrlPost,
    listRecordingsApiV1WorkflowRecordingsGet,
+    transcribeAudioApiV1WorkflowRecordingsTranscribePost,
 } from "@/client";
 import type { RecordingResponseSchema } from "@/client/types.gen";
 import { Button } from "@/components/ui/button";
@ -18,6 +19,15 @@ import {
 } from "@/components/ui/dialog";
 import { Input } from "@/components/ui/input";
 import { Label } from "@/components/ui/label";
+import {
+    Select,
+    SelectContent,
+    SelectItem,
+    SelectTrigger,
+    SelectValue,
+} from "@/components/ui/select";
+import { Textarea } from "@/components/ui/textarea";
+import { LANGUAGE_DISPLAY_NAMES } from "@/constants/languages";
 import { useUserConfig } from "@/context/UserConfigContext";

 interface RecordingsDialogProps {
@ -29,6 +39,8 @@ interface RecordingsDialogProps {

 const MAX_FILE_SIZE = 5 * 1024 * 1024; // 5MB

+/** Phases of the in-dialog recording flow: nothing in progress, entering a name, capturing audio, or waiting on auto-transcription. */
+type RecordingStep = "idle" | "naming" | "recording" | "transcribing";
+
 export const RecordingsDialog = ({
    open,
    onOpenChange,
@ -42,7 +54,16 @@ export const RecordingsDialog = ({
    const [transcript, setTranscript] = useState("");
    const [selectedFile, setSelectedFile] = useState<File | null>(null);
    const [error, setError] = useState<string | null>(null);
+    const [language, setLanguage] = useState("multi");
+    const [recordingStep, setRecordingStep] = useState<RecordingStep>("idle");
+    const [recordingFilename, setRecordingFilename] = useState("");
+    const [recordingDuration, setRecordingDuration] = useState(0);
+    const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+    const audioChunksRef = useRef<Blob[]>([]);
+    const recordingTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
    const fileInputRef = useRef<HTMLInputElement>(null);
+    const languageRef = useRef(language);
+    languageRef.current = language;

    const ttsProvider = (userConfig?.tts?.provider as string) ?? "";
    const ttsModel = (userConfig?.tts?.model as string) ?? "";
@ -70,14 +91,119 @@ export const RecordingsDialog = ({
        }
    }, [workflowId, ttsProvider, ttsModel, ttsVoiceId, onRecordingsChange]);

+    // Cancel the duration ticker, if one is running, and forget its handle.
+    const stopRecordingTimer = useCallback(() => {
+        const activeTimer = recordingTimerRef.current;
+        if (!activeTimer) return;
+        clearInterval(activeTimer);
+        recordingTimerRef.current = null;
+    }, []);
+
+    // Ask the current MediaRecorder (if any) to stop. A recorder that is
+    // already inactive is left untouched.
+    const stopRecording = useCallback(() => {
+        const recorder = mediaRecorderRef.current;
+        if (!recorder || recorder.state === "inactive") return;
+        recorder.stop();
+    }, []);
+
+    // Put the recording flow back to its starting point: zero elapsed time,
+    // empty recording name, step "idle".
+    const resetRecordingState = useCallback(() => {
+        setRecordingDuration(0);
+        setRecordingFilename("");
+        setRecordingStep("idle");
+    }, []);
+
    useEffect(() => {
        if (open) {
            fetchRecordings();
            setError(null);
            setTranscript("");
            setSelectedFile(null);
+            setLanguage("multi");
+            resetRecordingState();
        }
-    }, [open, fetchRecordings]);
+    }, [open, fetchRecordings, resetRecordingState]);
+
+    // Whenever the dialog is not open, stop any in-progress capture and its
+    // duration ticker.
+    useEffect(() => {
+        if (open) return;
+        stopRecording();
+        stopRecordingTimer();
+    }, [open, stopRecording, stopRecordingTimer]);
+
+    /**
+     * Send an audio file to the transcription endpoint and, when a transcript
+     * comes back, place it in the transcript field.
+     *
+     * The step is "transcribing" while the request is in flight and always
+     * returns to "idle" afterwards. Failures are surfaced through `error`
+     * rather than thrown, so the user can still type the transcript by hand.
+     *
+     * @param file Audio file (picked or freshly recorded) to transcribe.
+     */
+    const transcribeFile = async (file: File) => {
+        setRecordingStep("transcribing");
+        try {
+            const result = await transcribeAudioApiV1WorkflowRecordingsTranscribePost({
+                // Read the language from the ref so that a call made from an
+                // older closure (e.g. MediaRecorder.onstop) uses the latest selection.
+                body: { file, language: languageRef.current },
+            });
+            // NOTE(review): assumes the generated client returns `{ data, error }`
+            // instead of throwing on HTTP failures — confirm client config.
+            // Without this check a failed request would end silently with no
+            // transcript and no message.
+            if (result.error) {
+                throw new Error("Transcription request failed");
+            }
+            const data = result.data as Record<string, unknown> | undefined;
+            const text = data?.transcript;
+            // Only accept a non-empty string; anything else leaves the field as is.
+            if (typeof text === "string" && text) {
+                setTranscript(text);
+            }
+        } catch {
+            // Transcription failed — user can still type manually
+            setError("Auto-transcription failed. You can type the transcript manually.");
+        } finally {
+            setRecordingStep("idle");
+        }
+    };
+
+    /**
+     * Request microphone access and begin capturing audio.
+     *
+     * While capturing, the step is "recording" and a one-second ticker
+     * advances `recordingDuration`. When the recorder stops, the captured
+     * chunks are assembled into a File named after the recording name entered
+     * by the user (falling back to "recording"), stored as the selected file,
+     * and sent for auto-transcription. Empty or oversized captures are
+     * rejected with an error message instead.
+     *
+     * If anything fails while starting, the microphone stream is released,
+     * an error message is shown, and the recording state is reset.
+     */
+    const startRecording = async () => {
+        // Kept outside the try block so the catch handler can release the
+        // microphone if setup fails after access was granted.
+        let stream: MediaStream | null = null;
+        try {
+            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            const activeStream = stream;
+            const mediaRecorder = new MediaRecorder(activeStream);
+            mediaRecorderRef.current = mediaRecorder;
+            audioChunksRef.current = [];
+
+            mediaRecorder.ondataavailable = (e) => {
+                if (e.data.size > 0) audioChunksRef.current.push(e.data);
+            };
+
+            const filename = recordingFilename.trim() || "recording";
+            mediaRecorder.onstop = () => {
+                // Release the microphone as soon as capture ends.
+                activeStream.getTracks().forEach((t) => t.stop());
+                stopRecordingTimer();
+
+                const mimeType = mediaRecorder.mimeType;
+                const blob = new Blob(audioChunksRef.current, { type: mimeType });
+                // The blob owns the data now; drop the chunk references.
+                audioChunksRef.current = [];
+
+                if (blob.size === 0) {
+                    setError("No audio was captured. Please try recording again.");
+                    resetRecordingState();
+                    return;
+                }
+                if (blob.size > MAX_FILE_SIZE) {
+                    setError(`Recording (${(blob.size / (1024 * 1024)).toFixed(1)}MB) exceeds the maximum allowed size of 5MB.`);
+                    resetRecordingState();
+                    return;
+                }
+
+                // Pick an extension that matches the container the browser
+                // actually produced (some browsers record Ogg rather than WebM).
+                let ext = "mp4";
+                if (mimeType.includes("webm")) ext = "webm";
+                else if (mimeType.includes("ogg")) ext = "ogg";
+
+                const file = new File([blob], `${filename}.${ext}`, { type: mimeType });
+                setSelectedFile(file);
+                setError(null);
+                transcribeFile(file);
+            };
+
+            // Make sure a leftover ticker never runs alongside the new one.
+            stopRecordingTimer();
+            mediaRecorder.start();
+            setRecordingStep("recording");
+            setRecordingDuration(0);
+            setError(null);
+            recordingTimerRef.current = setInterval(() => {
+                setRecordingDuration((d) => d + 1);
+            }, 1000);
+        } catch (err) {
+            // Setup failed: release the microphone if it was already acquired.
+            stream?.getTracks().forEach((t) => t.stop());
+            stopRecordingTimer();
+
+            // Only blame permissions when the browser actually reported a
+            // permission problem; other failures (no device, unsupported
+            // API, recorder errors) get a generic message.
+            const permissionDenied =
+                err instanceof DOMException &&
+                (err.name === "NotAllowedError" || err.name === "SecurityError");
+            setError(
+                permissionDenied
+                    ? "Microphone access denied. Please allow microphone permissions."
+                    : "Could not start recording. Please check that a microphone is available and try again."
+            );
+            resetRecordingState();
+        }
+    };
+
+    // Click handler for the Stop button; simply delegates to stopRecording.
+    const handleStopRecording = () => stopRecording();
+
+    /**
+     * Accept or reject a file chosen through the hidden file input.
+     *
+     * A file within the size limit (or `null`, when nothing is selected)
+     * clears any error and becomes the selected file; a real file is also
+     * sent for auto-transcription. A file over the limit is rejected: an
+     * error is shown, the selection is cleared, and the input is reset.
+     *
+     * @param file The chosen file, or `null` when there is none.
+     */
+    const handleFileSelect = (file: File | null) => {
+        if (file === null || file.size <= MAX_FILE_SIZE) {
+            setError(null);
+            setSelectedFile(file);
+            if (file !== null) transcribeFile(file);
+            return;
+        }
+
+        const sizeInMb = (file.size / (1024 * 1024)).toFixed(1);
+        setError(`File size (${sizeInMb}MB) exceeds the maximum allowed size of 5MB.`);
+        setSelectedFile(null);
+        const input = fileInputRef.current;
+        if (input) input.value = "";
+    };

    const handleUpload = async () => {
        if (!selectedFile || !transcript.trim()) return;
@ -137,6 +263,7 @@ export const RecordingsDialog = ({
                        original_filename: selectedFile.name,
                        file_size_bytes: selectedFile.size,
                        mime_type: selectedFile.type,
+                        language,
                    },
                },
            });
@ -144,6 +271,8 @@ export const RecordingsDialog = ({
            // Reset form and refresh list
            setTranscript("");
            setSelectedFile(null);
+            setLanguage("multi");
+            resetRecordingState();
            if (fileInputRef.current) fileInputRef.current.value = "";
            await fetchRecordings();
        } catch (err) {
@ -166,13 +295,17 @@ export const RecordingsDialog = ({
        }
    };

+    const isRecording = recordingStep === "recording";
+    const isTranscribing = recordingStep === "transcribing";
+    const isBusy = uploading || isRecording || isTranscribing;
+
    return (
        <Dialog open={open} onOpenChange={onOpenChange}>
            <DialogContent className="max-w-lg max-h-[80vh] overflow-y-auto">
                <DialogHeader>
                    <DialogTitle>Workflow Recordings</DialogTitle>
                    <DialogDescription>
-                        Upload audio recordings for hybrid prompts. Recordings are
+                        Upload or record audio for hybrid prompts. Recordings are
                        scoped to your current TTS configuration. Use{" "}
                        <code className="text-xs bg-muted px-1 rounded">@</code> in
                        prompt fields to insert them.
@ -211,61 +344,158 @@ export const RecordingsDialog = ({

                {/* Upload Section */}
                <div className="space-y-3 border rounded-md p-3">
-                    <Label className="text-sm font-medium">Upload New Recording</Label>
+                    <Label className="text-sm font-medium">Add New Recording</Label>
+
+                    {/* Audio source: file picker or record */}
                    <div>
                        <Label className="text-xs text-muted-foreground">
                            Audio File
                        </Label>
-                        <input
-                            ref={fileInputRef}
-                            type="file"
-                            accept="audio/*"
-                            onChange={(e) => {
-                                const file = e.target.files?.[0] ?? null;
-                                if (file && file.size > MAX_FILE_SIZE) {
-                                    setError(
-                                        `File size (${(file.size / (1024 * 1024)).toFixed(1)}MB) exceeds the maximum allowed size of 5MB.`
-                                    );
-                                    setSelectedFile(null);
-                                    if (fileInputRef.current) fileInputRef.current.value = "";
-                                    return;
-                                }
-                                setError(null);
-                                setSelectedFile(file);
-                            }}
-                            className="hidden"
-                        />
-                        <Button
-                            type="button"
-                            variant="outline"
-                            size="sm"
-                            className="w-full justify-start text-sm font-normal"
-                            onClick={() => fileInputRef.current?.click()}
-                        >
-                            <Upload className="w-4 h-4 mr-2 shrink-0" />
-                            {selectedFile ? (
-                                <span className="truncate">
-                                    {selectedFile.name} ({(selectedFile.size / (1024 * 1024)).toFixed(1)}MB)
-                                </span>
-                            ) : (
-                                <span className="text-muted-foreground">Choose audio file (max 5MB)</span>
+                        <div className="flex gap-2">
+                            <input
+                                ref={fileInputRef}
+                                type="file"
+                                accept="audio/*"
+                                onChange={(e) => handleFileSelect(e.target.files?.[0] ?? null)}
+                                className="hidden"
+                            />
+                            <Button
+                                type="button"
+                                variant="outline"
+                                size="sm"
+                                className="flex-1 justify-start text-sm font-normal"
+                                onClick={() => fileInputRef.current?.click()}
+                                disabled={isBusy}
+                            >
+                                <Upload className="w-4 h-4 mr-2 shrink-0" />
+                                {selectedFile && recordingStep !== "naming" ? (
+                                    <span className="truncate">
+                                        {selectedFile.name} ({(selectedFile.size / (1024 * 1024)).toFixed(1)}MB)
+                                    </span>
+                                ) : (
+                                    <span className="text-muted-foreground">Choose audio file (max 5MB)</span>
+                                )}
+                            </Button>
+                            {recordingStep === "idle" && (
+                                <Button
+                                    type="button"
+                                    variant="outline"
+                                    size="sm"
+                                    onClick={() => setRecordingStep("naming")}
+                                    disabled={uploading || isTranscribing}
+                                >
+                                    <Mic className="w-4 h-4 mr-1" />
+                                    Record
+                                </Button>
                            )}
-                        </Button>
+                        </div>
                    </div>
+
+                    {/* Recording: filename + start/stop */}
+                    {(recordingStep === "naming" || isRecording) && (
+                        <div className="space-y-2 rounded-md border border-dashed p-3 bg-muted/20">
+                            {recordingStep === "naming" && (
+                                <>
+                                    <div>
+                                        <Label className="text-xs text-muted-foreground">
+                                            Recording Name
+                                        </Label>
+                                        <Input
+                                            placeholder="e.g. greeting, hold-message"
+                                            value={recordingFilename}
+                                            onChange={(e) => setRecordingFilename(e.target.value)}
+                                            autoFocus
+                                        />
+                                    </div>
+                                    <div className="flex gap-2">
+                                        <Button
+                                            size="sm"
+                                            onClick={startRecording}
+                                            disabled={!recordingFilename.trim()}
+                                        >
+                                            <Mic className="w-4 h-4 mr-1" />
+                                            Start Recording
+                                        </Button>
+                                        <Button
+                                            size="sm"
+                                            variant="ghost"
+                                            onClick={resetRecordingState}
+                                        >
+                                            Cancel
+                                        </Button>
+                                    </div>
+                                </>
+                            )}
+                            {isRecording && (
+                                <div className="flex items-center gap-3">
+                                    <span className="relative flex h-3 w-3">
+                                        <span className="animate-ping absolute inline-flex h-full w-full rounded-full bg-red-400 opacity-75" />
+                                        <span className="relative inline-flex rounded-full h-3 w-3 bg-red-500" />
+                                    </span>
+                                    <span className="text-sm font-mono">
+                                        {Math.floor(recordingDuration / 60)}:{(recordingDuration % 60).toString().padStart(2, "0")}
+                                    </span>
+                                    <span className="text-xs text-muted-foreground">{recordingFilename}</span>
+                                    <Button
+                                        size="sm"
+                                        variant="destructive"
+                                        onClick={handleStopRecording}
+                                        className="ml-auto"
+                                    >
+                                        <Square className="w-4 h-4 mr-1" />
+                                        Stop
+                                    </Button>
+                                </div>
+                            )}
+                        </div>
+                    )}
+
+                    {/* Transcribing progress */}
+                    {isTranscribing && (
+                        <div className="flex items-center gap-2 text-sm text-muted-foreground">
+                            <Loader2 className="w-4 h-4 animate-spin" />
+                            Transcribing audio...
+                        </div>
+                    )}
+
+                    {/* Language */}
+                    <div>
+                        <Label className="text-xs text-muted-foreground">
+                            Language
+                        </Label>
+                        <Select value={language} onValueChange={setLanguage}>
+                            <SelectTrigger className="h-9 text-sm">
+                                <SelectValue />
+                            </SelectTrigger>
+                            <SelectContent>
+                                {Object.entries(LANGUAGE_DISPLAY_NAMES).map(([code, name]) => (
+                                    <SelectItem key={code} value={code}>
+                                        {name}
+                                    </SelectItem>
+                                ))}
+                            </SelectContent>
+                        </Select>
+                    </div>
+
+                    {/* Transcript */}
                    <div>
                        <Label className="text-xs text-muted-foreground">
                            Transcript
                        </Label>
-                        <Input
-                            placeholder="What does this recording say?"
+                        <Textarea
+                            placeholder={isTranscribing ? "Transcribing..." : "What does this recording say?"}
                            value={transcript}
                            onChange={(e) => setTranscript(e.target.value)}
+                            disabled={isTranscribing}
+                            rows={3}
+                            className="resize-none text-sm"
                        />
                    </div>
+
                    <Button
                        size="sm"
                        onClick={handleUpload}
-                        disabled={!selectedFile || !transcript.trim() || uploading}
+                        disabled={!selectedFile || !transcript.trim() || isBusy}
                    >
                        {uploading ? (
                            <Loader2 className="w-4 h-4 mr-1 animate-spin" />
--- a/ui/src/app/workflow/[workflowId]/hooks/useWorkflowState.ts
+++ b/ui/src/app/workflow/[workflowId]/hooks/useWorkflowState.ts
@ -363,7 +363,13 @@ export const useWorkflowState = ({
    // Save workflow function
    const saveWorkflow = useCallback(async (updateWorkflowDefinition: boolean = true) => {
        if (!user || !rfInstance.current) return;
-        const flow = rfInstance.current.toObject();
+        // Read nodes/edges from the Zustand store (synchronously up-to-date)
+        // and viewport from the ReactFlow instance to build the flow object.
+        // This avoids a race condition where rfInstance.toObject() may return
+        // stale node data if React hasn't re-rendered yet after a store update.
+        const { nodes: currentNodes, edges: currentEdges } = useWorkflowStore.getState();
+        const viewport = rfInstance.current.getViewport();
+        const flow = { nodes: currentNodes, edges: currentEdges, viewport };
        try {
            await updateWorkflowApiV1WorkflowWorkflowIdPut({
                path: {