voice mode with TTS input/output

This commit is contained in:
Arjun 2026-03-13 10:26:08 +05:30 committed by arkml
parent d150294af1
commit 47d5118448
17 changed files with 937 additions and 15 deletions

View file

@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
import { search } from '@x/core/dist/search/search.js';
import { versionHistory } from '@x/core';
import { versionHistory, voice } from '@x/core';
import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';
type InvokeChannels = ipc.InvokeChannels;
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
return runsCore.createRun(args);
},
'runs:createMessage': async (_event, args) => {
return { messageId: await runsCore.createMessage(args.runId, args.message) };
return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
},
'runs:authorizePermission': async (_event, args) => {
await runsCore.authorizePermission(args.runId, args.authorization);
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
const schedule = await classifySchedule(args.instruction);
return { schedule };
},
'voice:getConfig': async () => {
return voice.getVoiceConfig();
},
'voice:synthesize': async (_event, args) => {
return voice.synthesizeSpeech(args.text);
},
});
}

View file

@ -1,4 +1,4 @@
import { app, BrowserWindow, protocol, net, shell } from "electron";
import { app, BrowserWindow, protocol, net, shell, session } from "electron";
import path from "node:path";
import {
setupIpcHandlers,
@ -92,6 +92,15 @@ function createWindow() {
},
});
// Grant microphone permission for voice mode
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
if (permission === 'media') {
callback(true);
} else {
callback(false);
}
});
// Show window when content is ready to prevent blank screen
win.once("ready-to-show", () => {
win.show();

View file

@ -76,6 +76,8 @@ import {
import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
import { toast } from "sonner"
import { useVoiceMode } from '@/hooks/useVoiceMode'
import { useVoiceTTS } from '@/hooks/useVoiceTTS'
type DirEntry = z.infer<typeof workspace.DirEntry>
type RunEventType = z.infer<typeof RunEvent>
@ -546,6 +548,87 @@ function App() {
const [agentId] = useState<string>('copilot')
const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)
// Voice mode state
const [voiceAvailable, setVoiceAvailable] = useState(false)
const [ttsAvailable, setTtsAvailable] = useState(false)
const [ttsEnabled, setTtsEnabled] = useState(false)
const ttsEnabledRef = useRef(false)
const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
const ttsModeRef = useRef<'summary' | 'full'>('summary')
const [isRecording, setIsRecording] = useState(false)
const voiceTextBufferRef = useRef('')
const spokenIndexRef = useRef(0)
const isRecordingRef = useRef(false)
const tts = useVoiceTTS()
const ttsRef = useRef(tts)
ttsRef.current = tts
const voice = useVoiceMode()
const voiceRef = useRef(voice)
voiceRef.current = voice
// Check if voice is available on mount
useEffect(() => {
window.ipc.invoke('voice:getConfig', null).then(config => {
setVoiceAvailable(!!config.deepgram)
setTtsAvailable(!!config.elevenlabs)
}).catch(() => {
setVoiceAvailable(false)
setTtsAvailable(false)
})
}, [])
const handleStartRecording = useCallback(() => {
setIsRecording(true)
isRecordingRef.current = true
voice.start()
}, [voice])
const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
const pendingVoiceInputRef = useRef(false)
const handleSubmitRecording = useCallback(() => {
const text = voice.submit()
setIsRecording(false)
isRecordingRef.current = false
if (text) {
pendingVoiceInputRef.current = true
handlePromptSubmitRef.current?.({ text })
}
}, [voice])
const handleToggleTts = useCallback(() => {
setTtsEnabled(prev => {
const next = !prev
ttsEnabledRef.current = next
if (!next) {
ttsRef.current.cancel()
}
return next
})
}, [])
const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
setTtsMode(mode)
ttsModeRef.current = mode
}, [])
const handleCancelRecording = useCallback(() => {
voice.cancel()
setIsRecording(false)
isRecordingRef.current = false
}, [voice])
// Helper to cancel recording from any navigation handler
const cancelRecordingIfActive = useCallback(() => {
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
}, [])
// Runs history state
type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
const [runs, setRuns] = useState<RunListItem[]>([])
@ -1496,6 +1579,9 @@ function App() {
if (!isActiveRun) return
setIsProcessing(true)
setModelUsage(null)
// Reset voice buffer for new response
voiceTextBufferRef.current = ''
spokenIndexRef.current = 0
break
case 'run-processing-end':
@ -1545,6 +1631,20 @@ function App() {
if (llmEvent.type === 'text-delta' && llmEvent.delta) {
appendStreamingBuffer(event.runId, llmEvent.delta)
setCurrentAssistantMessage(prev => prev + llmEvent.delta)
// Extract completed <voice>…</voice> segments from the streamed assistant text
// and hand each one to TTS when voice output is enabled.
voiceTextBufferRef.current += llmEvent.delta
// Only scan the part of the buffer that earlier matches have not consumed yet.
const scanStart = spokenIndexRef.current
const remaining = voiceTextBufferRef.current.substring(scanStart)
const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
let voiceMatch: RegExpExecArray | null
while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
  const voiceContent = voiceMatch[1].trim()
  console.log('[voice] extracted voice tag:', voiceContent)
  if (voiceContent && ttsEnabledRef.current) {
    ttsRef.current.speak(voiceContent)
  }
  // Match offsets are relative to `remaining`, so rebase on the scan start.
  // Accumulating `index + length` per match would double-count the offset when
  // more than one complete tag is found in a single pass and skip later tags.
  spokenIndexRef.current = scanStart + voiceRegex.lastIndex
}
} else if (llmEvent.type === 'tool-call') {
setConversation(prev => [...prev, {
id: llmEvent.toolCallId || `tool-${Date.now()}`,
@ -1584,6 +1684,7 @@ function App() {
if (msg.role === 'assistant') {
setCurrentAssistantMessage(currentMsg => {
if (currentMsg) {
const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
setConversation(prev => {
const exists = prev.some(m =>
m.id === event.messageId && 'role' in m && m.role === 'assistant'
@ -1592,7 +1693,7 @@ function App() {
return [...prev, {
id: event.messageId,
role: 'assistant',
content: currentMsg,
content: cleanedContent,
timestamp: Date.now(),
}]
})
@ -1887,6 +1988,8 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: attachmentPayload,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
} else {
// Legacy path: plain string with optional XML-formatted @mentions.
@ -1915,11 +2018,15 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: formattedMessage,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
titleSource = formattedMessage
}
pendingVoiceInputRef.current = false
if (isNewRun) {
const inferredTitle = inferRunTitleFromMessage(titleSource)
setRuns((prev) => {
@ -1936,6 +2043,7 @@ function App() {
console.error('Failed to send message:', error)
}
}
handlePromptSubmitRef.current = handlePromptSubmit
const handleStop = useCallback(async () => {
if (!runId) return
@ -2065,6 +2173,7 @@ function App() {
}, [])
const openChatInNewTab = useCallback((targetRunId: string) => {
cancelRecordingIfActive()
const existingTab = chatTabs.find(t => t.runId === targetRunId)
if (existingTab) {
// Cancel stale in-flight loads from previously focused tabs.
@ -2080,12 +2189,18 @@ function App() {
setChatTabs(prev => [...prev, { id, runId: targetRunId }])
setActiveChatTabId(id)
loadRun(targetRunId)
}, [chatTabs, loadRun, restoreChatTabState])
}, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])
const switchChatTab = useCallback((tabId: string) => {
const tab = chatTabs.find(t => t.id === tabId)
if (!tab) return
if (tabId === activeChatTabId) return
// Cancel any active recording when switching tabs
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
saveChatScrollForTab(activeChatTabId)
// Cancel stale in-flight loads from previously focused tabs.
loadRunRequestIdRef.current += 1
@ -2471,13 +2586,14 @@ function App() {
const current = currentViewState
if (viewStatesEqual(current, nextView)) return
cancelRecordingIfActive()
const nextHistory = {
back: appendUnique(historyRef.current.back, current),
forward: [] as ViewState[],
}
setHistory(nextHistory)
await applyViewState(nextView)
}, [appendUnique, applyViewState, currentViewState, setHistory])
}, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])
const navigateBack = useCallback(async () => {
const { back, forward } = historyRef.current
@ -3412,6 +3528,7 @@ function App() {
tasksActions={{
onNewChat: handleNewChatTab,
onSelectRun: (runIdToLoad) => {
cancelRecordingIfActive()
if (selectedPath || isGraphOpen) {
setIsChatSidebarOpen(true)
}
@ -3814,7 +3931,7 @@ function App() {
{tabState.currentAssistantMessage && (
<Message from="assistant">
<MessageContent>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
</MessageContent>
</Message>
)}
@ -3865,6 +3982,18 @@ function App() {
runId={tabState.runId}
initialDraft={chatDraftsRef.current.get(tab.id)}
onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
isRecording={isActive && isRecording}
recordingText={isActive ? voice.interimText : undefined}
recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
onStartRecording={isActive ? handleStartRecording : undefined}
onSubmitRecording={isActive ? handleSubmitRecording : undefined}
onCancelRecording={isActive ? handleCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? handleToggleTts : undefined}
onTtsModeChange={isActive ? handleTtsModeChange : undefined}
/>
</div>
)
@ -3914,6 +4043,18 @@ function App() {
onToolOpenChangeForTab={setToolOpenForTab}
onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
onActivate={() => setActiveShortcutPane('right')}
isRecording={isRecording}
recordingText={voice.interimText}
recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
onStartRecording={handleStartRecording}
onSubmitRecording={handleSubmitRecording}
onCancelRecording={handleCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={handleToggleTts}
onTtsModeChange={handleTtsModeChange}
/>
)}
{/* Rendered last so its no-drag region paints over the sidebar drag region */}

View file

@ -1,4 +1,5 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
import {
ArrowUp,
AudioLines,
@ -9,7 +10,9 @@ import {
FileSpreadsheet,
FileText,
FileVideo,
Headphones,
LoaderIcon,
Mic,
Plus,
Square,
X,
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
function ChatInputInner({
@ -115,6 +130,18 @@ function ChatInputInner({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputInnerProps) {
const controller = usePromptInputController()
const message = controller.textInput.value
@ -367,6 +394,40 @@ function ChatInputInner({
e.target.value = ''
}}
/>
{isRecording ? (
/* ── Recording bar ── */
<div className="flex items-center gap-3 px-4 py-3">
<button
type="button"
onClick={onCancelRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Cancel recording"
>
<X className="h-4 w-4" />
</button>
<div className="flex flex-1 items-center gap-2 overflow-hidden">
<VoiceWaveform />
<span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
{recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
</span>
</div>
<Button
size="icon"
onClick={onSubmitRecording}
disabled={!recordingText?.trim()}
className={cn(
'h-7 w-7 shrink-0 rounded-full transition-all',
recordingText?.trim()
? 'bg-primary text-primary-foreground hover:bg-primary/90'
: 'bg-muted text-muted-foreground'
)}
>
<ArrowUp className="h-4 w-4" />
</Button>
</div>
) : (
/* ── Normal input ── */
<>
<div className="px-4 pt-4 pb-2">
<PromptInputTextarea
placeholder="Type your message..."
@ -414,6 +475,63 @@ function ChatInputInner({
</DropdownMenuContent>
</DropdownMenu>
)}
{onToggleTts && ttsAvailable && (
<div className="flex shrink-0 items-center">
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
onClick={onToggleTts}
className={cn(
'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
ttsEnabled
? 'text-foreground hover:bg-muted'
: 'text-muted-foreground hover:bg-muted hover:text-foreground'
)}
aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
>
<Headphones className="h-4 w-4" />
{!ttsEnabled && (
<span className="absolute inset-0 flex items-center justify-center pointer-events-none">
<span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
</span>
)}
</button>
</TooltipTrigger>
<TooltipContent side="top">
{ttsEnabled ? 'Voice output on' : 'Voice output off'}
</TooltipContent>
</Tooltip>
{ttsEnabled && onTtsModeChange && (
<DropdownMenu>
<DropdownMenuTrigger asChild>
<button
type="button"
className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
>
<ChevronDown className="h-3 w-3" />
</button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end">
<DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
<DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
<DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
</DropdownMenuRadioGroup>
</DropdownMenuContent>
</DropdownMenu>
)}
</div>
)}
{voiceAvailable && onStartRecording && (
<button
type="button"
onClick={onStartRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Voice input"
>
<Mic className="h-4 w-4" />
</button>
)}
{isProcessing ? (
<Button
size="icon"
@ -448,6 +566,31 @@ function ChatInputInner({
</Button>
)}
</div>
</>
)}
</div>
)
}
/**
 * Recording indicator: five thin bars that grow and shrink on a staggered
 * `voice-wave` animation. The keyframes are emitted inline next to the bars.
 */
function VoiceWaveform() {
  const barCount = 5
  const keyframesCss = `
@keyframes voice-wave {
0%, 100% { height: 4px; }
50% { height: 16px; }
}
`
  // Each bar starts 0.15s after the previous one so the bars ripple.
  const bars = Array.from({ length: barCount }, (_, barIndex) => {
    const barStyle = {
      animation: `voice-wave 1.2s ease-in-out ${barIndex * 0.15}s infinite`,
    }
    return (
      <span
        key={barIndex}
        className="w-[3px] rounded-full bg-primary"
        style={barStyle}
      />
    )
  })

  return (
    <div className="flex items-center gap-[3px] h-5">
      {bars}
      <style>{keyframesCss}</style>
    </div>
  )
}
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatInputWithMentions({
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputWithMentionsProps) {
return (
<PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
runId={runId}
initialDraft={initialDraft}
onDraftChange={onDraftChange}
isRecording={isRecording}
recordingText={recordingText}
recordingState={recordingState}
onStartRecording={onStartRecording}
onSubmitRecording={onSubmitRecording}
onCancelRecording={onCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={onToggleTts}
onTtsModeChange={onTtsModeChange}
/>
</PromptInputProvider>
)

View file

@ -108,6 +108,19 @@ interface ChatSidebarProps {
onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
onOpenKnowledgeFile?: (path: string) => void
onActivate?: () => void
// Voice / TTS props
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatSidebar({
@ -146,6 +159,18 @@ export function ChatSidebar({
onToolOpenChangeForTab,
onOpenKnowledgeFile,
onActivate,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatSidebarProps) {
const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
const [isResizing, setIsResizing] = useState(false)
@ -542,6 +567,18 @@ export function ChatSidebar({
runId={tabState.runId}
initialDraft={getInitialDraft?.(tab.id)}
onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
isRecording={isActive && isRecording}
recordingText={isActive ? recordingText : undefined}
recordingState={isActive ? recordingState : undefined}
onStartRecording={isActive ? onStartRecording : undefined}
onSubmitRecording={isActive ? onSubmitRecording : undefined}
onCancelRecording={isActive ? onCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? onToggleTts : undefined}
onTtsModeChange={isActive ? onTtsModeChange : undefined}
/>
</div>
)

View file

@ -0,0 +1,218 @@
import { useCallback, useEffect, useRef, useState } from 'react';
/**
 * Lifecycle of a recording as exposed by `useVoiceMode`:
 * - `idle`: no recording in progress
 * - `connecting`: `start()` is waiting for the Deepgram WebSocket to open
 * - `listening`: set once the socket is open, just before microphone capture starts
 */
export type VoiceState = 'idle' | 'connecting' | 'listening';
// Module-level cache of the Deepgram API key, shared by every hook instance so the
// `voice:getConfig` IPC call can be skipped after the key has been obtained once.
// Stays `null` until a key has been read from the config.
let cachedApiKey: string | null = null;
// Set to true as soon as the on-mount fetch has been attempted (even if it fails),
// so the mount effect fetches at most once; `start()` retries while the key is null.
let apiKeyFetched = false;
/**
 * Push-to-talk speech-to-text hook backed by Deepgram's streaming WebSocket API.
 *
 * The WebSocket is opened once an API key is available and kept open while the hook
 * is mounted; each recording only starts and stops microphone capture. Audio is
 * captured at 16 kHz mono, converted to 16-bit PCM and streamed to the socket.
 *
 * @returns
 *  - `state`: `'idle' | 'connecting' | 'listening'`
 *  - `interimText`: finalized transcript plus the latest interim result
 *  - `start()`: begin a recording (no-op while one is already active)
 *  - `submit()`: stop recording and return the trimmed transcript
 *  - `cancel()`: stop recording and discard the transcript
 */
export function useVoiceMode() {
  const [state, setState] = useState<VoiceState>('idle');
  const [interimText, setInterimText] = useState('');
  const wsRef = useRef<WebSocket | null>(null);
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const audioCtxRef = useRef<AudioContext | null>(null);
  const transcriptBufferRef = useRef('');
  const interimRef = useRef('');
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const mountedRef = useRef(true);
  // True from the moment start() is accepted until the recording is stopped.
  // A ref (not `state`) so the guard cannot be defeated by a stale closure.
  const activeRef = useRef(false);
  // Bumped on every start and stop; an in-flight start() compares its own value
  // against this to detect that it was cancelled or replaced while awaiting.
  const sessionRef = useRef(0);

  // Connect (or reconnect) the Deepgram WebSocket.
  // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
  const connectWs = useCallback(() => {
    if (!cachedApiKey) return;
    const existing = wsRef.current;
    if (existing && (existing.readyState === WebSocket.OPEN || existing.readyState === WebSocket.CONNECTING)) return;

    const ws = new WebSocket(
      `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`,
      ['token', cachedApiKey]
    );
    wsRef.current = ws;

    // Deepgram closes streaming sockets that stay silent for a few seconds, so an
    // idle socket needs periodic KeepAlive text frames to remain connected.
    let keepAliveTimer: ReturnType<typeof setInterval> | null = null;
    const stopKeepAlive = () => {
      if (keepAliveTimer !== null) {
        clearInterval(keepAliveTimer);
        keepAliveTimer = null;
      }
    };

    ws.onopen = () => {
      console.log('[voice] WebSocket connected');
      keepAliveTimer = setInterval(() => {
        // Self-clean when the socket is gone (onclose is nulled on unmount).
        if (ws.readyState !== WebSocket.OPEN) {
          stopKeepAlive();
          return;
        }
        ws.send(JSON.stringify({ type: 'KeepAlive' }));
      }, 5000);
    };

    ws.onmessage = (event) => {
      // Results that arrive after a recording has ended are not part of any transcript.
      if (!activeRef.current) return;
      if (typeof event.data !== 'string') return;
      let data: { is_final?: boolean; channel?: { alternatives?: Array<{ transcript?: string }> } } | null;
      try {
        data = JSON.parse(event.data);
      } catch (err) {
        console.error('[voice] ignoring malformed WebSocket message:', err);
        return;
      }
      const transcript = data?.channel?.alternatives?.[0]?.transcript;
      if (!transcript) return;

      const finalized = transcriptBufferRef.current;
      const joined = finalized + (finalized ? ' ' : '') + transcript;
      if (data?.is_final) {
        transcriptBufferRef.current = joined;
        interimRef.current = '';
      } else {
        interimRef.current = transcript;
      }
      setInterimText(joined);
    };

    ws.onerror = () => {
      console.error('[voice] WebSocket error');
    };

    ws.onclose = () => {
      console.log('[voice] WebSocket closed');
      stopKeepAlive();
      // A newer socket may already have replaced this one; leave it alone.
      if (wsRef.current !== ws) return;
      wsRef.current = null;
      // Auto-reconnect after 3 seconds if still mounted
      if (mountedRef.current && cachedApiKey) {
        if (reconnectTimerRef.current) clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = setTimeout(() => {
          reconnectTimerRef.current = null;
          if (mountedRef.current) connectWs();
        }, 3000);
      }
    };
  }, []);

  // Release the microphone, processor and AudioContext and reset transcript buffers.
  // Touches refs only, so it is also safe to call from the unmount cleanup.
  const releaseAudio = useCallback(() => {
    sessionRef.current += 1; // invalidates any start() that is still awaiting
    activeRef.current = false;
    if (processorRef.current) {
      processorRef.current.onaudioprocess = null;
      processorRef.current.disconnect();
      processorRef.current = null;
    }
    if (audioCtxRef.current) {
      void audioCtxRef.current.close().catch(() => { /* already closed */ });
      audioCtxRef.current = null;
    }
    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach(t => t.stop());
      mediaStreamRef.current = null;
    }
    transcriptBufferRef.current = '';
    interimRef.current = '';
  }, []);

  // Fetch API key on mount and establish persistent WebSocket
  useEffect(() => {
    mountedRef.current = true;

    const init = async () => {
      if (!apiKeyFetched) {
        apiKeyFetched = true;
        try {
          const config = await window.ipc.invoke('voice:getConfig', null);
          cachedApiKey = config.deepgram?.apiKey ?? null;
        } catch (err) {
          // Best effort: start() retries the fetch while the key is still missing.
          console.error('[voice] failed to load voice config:', err);
        }
      }
      if (cachedApiKey && mountedRef.current) {
        connectWs();
      }
    };
    void init();

    return () => {
      mountedRef.current = false;
      if (reconnectTimerRef.current) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }
      // Do not leave the microphone open after the component is gone.
      releaseAudio();
      // Close WS on unmount, suppress reconnect by nulling onclose
      if (wsRef.current) {
        wsRef.current.onclose = null;
        wsRef.current.close();
        wsRef.current = null;
      }
    };
  }, [connectWs, releaseAudio]);

  // Stop only audio capture (mic + processor), leaving WS open
  const stopAudioCapture = useCallback(() => {
    releaseAudio();
    setInterimText('');
    setState('idle');
  }, [releaseAudio]);

  const start = useCallback(async () => {
    if (activeRef.current) return;
    activeRef.current = true;
    const session = ++sessionRef.current;
    // True once cancel()/submit()/unmount happened while this call was awaiting.
    const superseded = () => !mountedRef.current || sessionRef.current !== session;
    const abort = () => {
      if (superseded()) return; // whoever superseded us owns the state now
      activeRef.current = false;
      setState('idle');
    };

    // Ensure we have an API key
    if (!cachedApiKey) {
      try {
        const config = await window.ipc.invoke('voice:getConfig', null);
        cachedApiKey = config.deepgram?.apiKey ?? null;
      } catch (err) {
        console.error('[voice] failed to load voice config:', err);
      }
      if (superseded()) return;
    }
    if (!cachedApiKey) {
      console.error('Deepgram not configured');
      abort();
      return;
    }

    transcriptBufferRef.current = '';
    interimRef.current = '';
    setInterimText('');

    // If WS isn't connected, connect and wait for it
    if (wsRef.current?.readyState !== WebSocket.OPEN) {
      setState('connecting');
      connectWs();
      // Wait for WS to be ready (up to 5 seconds), giving up early if cancelled
      const wsOk = await new Promise<boolean>((resolve) => {
        const finish = (ok: boolean) => {
          clearInterval(poll);
          clearTimeout(deadline);
          resolve(ok);
        };
        const poll = setInterval(() => {
          if (sessionRef.current !== session) finish(false);
          else if (wsRef.current?.readyState === WebSocket.OPEN) finish(true);
        }, 50);
        const deadline = setTimeout(() => finish(false), 5000);
      });
      if (superseded()) return;
      if (!wsOk) {
        abort();
        return;
      }
    }

    setState('listening');

    // Start mic
    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error('Microphone access denied:', err);
      abort();
      return;
    }
    if (superseded()) {
      // Cancelled while the permission prompt was pending: release the mic right away.
      stream.getTracks().forEach(t => t.stop());
      return;
    }
    mediaStreamRef.current = stream;

    // Start audio capture
    try {
      const audioCtx = new AudioContext({ sampleRate: 16000 });
      audioCtxRef.current = audioCtx;
      const source = audioCtx.createMediaStreamSource(stream);
      const processor = audioCtx.createScriptProcessor(4096, 1, 1);
      processorRef.current = processor;

      processor.onaudioprocess = (e) => {
        const ws = wsRef.current;
        if (!ws || ws.readyState !== WebSocket.OPEN) return;
        // Convert float samples in [-1, 1] to signed 16-bit PCM (linear16).
        const float32 = e.inputBuffer.getChannelData(0);
        const int16 = new Int16Array(float32.length);
        for (let i = 0; i < float32.length; i++) {
          const s = Math.max(-1, Math.min(1, float32[i]));
          int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
        }
        ws.send(int16.buffer);
      };

      source.connect(processor);
      processor.connect(audioCtx.destination);
    } catch (err) {
      console.error('[voice] failed to start audio capture:', err);
      stopAudioCapture();
    }
  }, [connectWs, stopAudioCapture]);

  /** Stop recording and return the full transcript (finalized + any current interim) */
  const submit = useCallback((): string => {
    let text = transcriptBufferRef.current;
    if (interimRef.current) {
      text += (text ? ' ' : '') + interimRef.current;
    }
    text = text.trim();
    stopAudioCapture();
    return text;
  }, [stopAudioCapture]);

  /** Cancel recording without returning transcript */
  const cancel = useCallback(() => {
    stopAudioCapture();
  }, [stopAudioCapture]);

  return { state, interimText, start, submit, cancel };
}

View file

@ -0,0 +1,72 @@
import { useCallback, useRef, useState } from 'react';
/**
 * Playback status reported by `useVoiceTTS`:
 * - `idle`: nothing queued or playing
 * - `synthesizing`: waiting for the `voice:synthesize` IPC call to return audio
 * - `speaking`: an audio clip is being played
 */
export type TTSState = 'idle' | 'synthesizing' | 'speaking';
/**
 * Text-to-speech hook: queues text snippets, synthesizes each one through the
 * `voice:synthesize` IPC channel and plays the returned audio clips one at a time.
 *
 * @returns
 *  - `state`: `'idle' | 'synthesizing' | 'speaking'`
 *  - `speak(text)`: enqueue text; playback starts if nothing is in progress
 *  - `cancel()`: drop the queue and stop the current clip immediately
 */
export function useVoiceTTS() {
  const [state, setState] = useState<TTSState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const queueRef = useRef<string[]>([]);
  const processingRef = useRef(false);
  // Bumped by cancel(); a queue loop started under an older generation must stop
  // and must not touch shared state that a newer loop may now own.
  const generationRef = useRef(0);
  // Stops the clip that is currently playing and settles its pending promise.
  const stopPlaybackRef = useRef<(() => void) | null>(null);

  // Play one clip; resolves when it ends or is stopped, rejects on playback failure.
  const playClip = useCallback((dataUrl: string) => {
    return new Promise<void>((resolve, reject) => {
      const audio = new Audio(dataUrl);
      audioRef.current = audio;
      const release = () => {
        if (audioRef.current === audio) {
          audioRef.current = null;
          stopPlaybackRef.current = null;
        }
      };
      // A paused element never fires `ended`, so the stop path has to resolve the
      // promise itself or the queue loop would wait forever.
      stopPlaybackRef.current = () => {
        audio.pause();
        release();
        resolve();
      };
      audio.onended = () => {
        console.log('[tts] audio ended');
        release();
        resolve();
      };
      audio.onerror = (e) => {
        console.error('[tts] audio error:', e);
        release();
        reject(new Error('Audio playback failed'));
      };
      audio.play().then(() => {
        console.log('[tts] audio playing');
      }).catch((err) => {
        console.error('[tts] play() rejected:', err);
        release();
        reject(err);
      });
    });
  }, []);

  const processQueue = useCallback(async () => {
    if (processingRef.current) return;
    processingRef.current = true;
    const generation = generationRef.current;
    const cancelled = () => generationRef.current !== generation;

    while (!cancelled() && queueRef.current.length > 0) {
      const text = queueRef.current.shift();
      if (text === undefined || !text.trim()) continue;

      setState('synthesizing');
      console.log('[tts] synthesizing:', text.substring(0, 80));
      try {
        const result = await window.ipc.invoke('voice:synthesize', { text });
        // cancel() may have run while synthesis was in flight: do not play the clip.
        if (cancelled()) break;
        console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
        setState('speaking');
        await playClip(`data:${result.mimeType};base64,${result.audioBase64}`);
      } catch (err) {
        // One failed snippet should not stop the rest of the queue.
        console.error('[tts] error:', err);
      }
    }

    // After a cancel, cancel() has already reset the shared flags and a newer loop
    // may be running; only the current generation may mark the queue as drained.
    if (!cancelled()) {
      audioRef.current = null;
      processingRef.current = false;
      setState('idle');
    }
  }, [playClip]);

  const speak = useCallback((text: string) => {
    console.log('[tts] speak() called:', text.substring(0, 80));
    queueRef.current.push(text);
    void processQueue();
  }, [processQueue]);

  const cancel = useCallback(() => {
    generationRef.current += 1;
    queueRef.current = [];
    stopPlaybackRef.current?.();
    audioRef.current = null;
    processingRef.current = false;
    setState('idle');
  }, []);

  return { state, speak, cancel };
}