From 4f95ca91a6b43f0f429da7bfd36033cf05f4b92e Mon Sep 17 00:00:00 2001 From: Arjun <6592213+arkml@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:23:02 +0530 Subject: [PATCH] make voice input faster --- apps/x/apps/renderer/src/App.tsx | 9 +- .../x/apps/renderer/src/hooks/useVoiceMode.ts | 113 +++++++++++------- 2 files changed, 74 insertions(+), 48 deletions(-) diff --git a/apps/x/apps/renderer/src/App.tsx b/apps/x/apps/renderer/src/App.tsx index 591ef21e..415c53ec 100644 --- a/apps/x/apps/renderer/src/App.tsx +++ b/apps/x/apps/renderer/src/App.tsx @@ -697,13 +697,18 @@ function App() { window.ipc.invoke('oauth:getState', null), ]).then(([config, oauthState]) => { const rowboatConnected = oauthState.config?.rowboat?.connected ?? false - setVoiceAvailable(!!config.deepgram || rowboatConnected) + const hasVoice = !!config.deepgram || rowboatConnected + setVoiceAvailable(hasVoice) setTtsAvailable(!!config.elevenlabs || rowboatConnected) + // Pre-cache auth details so mic click skips IPC round-trips + if (hasVoice) { + voice.warmup() + } }).catch(() => { setVoiceAvailable(false) setTtsAvailable(false) }) - }, []) + }, [voice]) useEffect(() => { refreshVoiceAvailability() diff --git a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts index 854ac9ea..96144453 100644 --- a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts +++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts @@ -16,6 +16,9 @@ const DEEPGRAM_PARAMS = new URLSearchParams({ }); const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`; +// Cache auth details so we don't need IPC round-trips on every mic click +let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null; + export function useVoiceMode() { const { refresh: refreshRowboatAccount } = useRowboatAccount(); const [state, setState] = useState('idle'); @@ -26,32 +29,54 @@ export function useVoiceMode() { const audioCtxRef = useRef(null); const transcriptBufferRef = useRef(''); const interimRef = useRef(''); + // Buffer audio chunks captured before the WebSocket is ready + const audioBufferRef = useRef([]); - // Connect (or reconnect) the Deepgram WebSocket. - // Refreshes Rowboat account before connect so access token is current. - const connectWs = useCallback(async () => { - if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return; - - let ws: WebSocket; - + // Refresh cached auth details (called on warmup, not on mic click) + const refreshAuth = useCallback(async () => { const account = await refreshRowboatAccount(); if ( account?.signedIn && account.accessToken && account.config?.websocketApiUrl ) { - const listenUrl = buildDeepgramListenUrl(account.config.websocketApiUrl, DEEPGRAM_PARAMS); - ws = new WebSocket(listenUrl, ['bearer', account.accessToken]); + cachedAuth = { type: 'rowboat', url: account.config.websocketApiUrl, token: account.accessToken }; } else { - // Fall back to local API key (passed as subprotocol) const config = await window.ipc.invoke('voice:getConfig', null); - if (!config?.deepgram) return; - ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]); + if (config?.deepgram) { + cachedAuth = { type: 'local', apiKey: config.deepgram.apiKey }; + } + } + }, [refreshRowboatAccount]); + + // Create and connect a Deepgram WebSocket using cached auth. + // Starts the connection and returns immediately (does not wait for open). + const connectWs = useCallback(async () => { + if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return; + + // Refresh auth if we don't have it cached yet + if (!cachedAuth) { + await refreshAuth(); + } + if (!cachedAuth) return; + + let ws: WebSocket; + if (cachedAuth.type === 'rowboat') { + const listenUrl = buildDeepgramListenUrl(cachedAuth.url, DEEPGRAM_PARAMS); + ws = new WebSocket(listenUrl, ['bearer', cachedAuth.token]); + } else { + ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', cachedAuth.apiKey]); } wsRef.current = ws; ws.onopen = () => { console.log('[voice] WebSocket connected'); + // Flush any buffered audio captured while we were connecting + const buffered = audioBufferRef.current; + audioBufferRef.current = []; + for (const chunk of buffered) { + ws.send(chunk); + } }; ws.onmessage = (event) => { @@ -73,13 +98,15 @@ export function useVoiceMode() { ws.onerror = () => { console.error('[voice] WebSocket error'); + // Auth may be stale — clear cache so next attempt refreshes + cachedAuth = null; }; ws.onclose = () => { console.log('[voice] WebSocket closed'); wsRef.current = null; }; - }, [refreshRowboatAccount]); + }, [refreshAuth]); // Stop audio capture and close WS const stopAudioCapture = useCallback(() => { @@ -100,6 +127,7 @@ export function useVoiceMode() { wsRef.current.close(); wsRef.current = null; } + audioBufferRef.current = []; setInterimText(''); transcriptBufferRef.current = ''; interimRef.current = ''; @@ -112,45 +140,28 @@ export function useVoiceMode() { transcriptBufferRef.current = ''; interimRef.current = ''; setInterimText(''); + audioBufferRef.current = []; - // If WS isn't connected, connect and wait for it - if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) { - setState('connecting'); - connectWs(); - // Wait for WS to be ready (up to 5 seconds) - const wsOk = await new Promise((resolve) => { - const checkInterval = setInterval(() => { - if (wsRef.current?.readyState === WebSocket.OPEN) { - clearInterval(checkInterval); - resolve(true); - } - }, 50); - setTimeout(() => { - clearInterval(checkInterval); - resolve(false); - }, 5000); - }); - if (!wsOk) { - setState('idle'); - return; - } - } - + // Show listening immediately — don't wait for WebSocket setState('listening'); - // Start mic - let stream: MediaStream | null = null; - try { - stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - } catch (err) { - console.error('Microphone access denied:', err); + // Kick off mic + WebSocket in parallel, don't await WebSocket + const [stream] = await Promise.all([ + navigator.mediaDevices.getUserMedia({ audio: true }).catch((err) => { + console.error('Microphone access denied:', err); + return null; + }), + connectWs(), + ]); + + if (!stream) { setState('idle'); return; } mediaStreamRef.current = stream; - // Start audio capture + // Start audio capture immediately — buffer if WS isn't open yet const audioCtx = new AudioContext({ sampleRate: 16000 }); audioCtxRef.current = audioCtx; const source = audioCtx.createMediaStreamSource(stream); @@ -158,14 +169,19 @@ export function useVoiceMode() { processorRef.current = processor; processor.onaudioprocess = (e) => { - if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return; const float32 = e.inputBuffer.getChannelData(0); const int16 = new Int16Array(float32.length); for (let i = 0; i < float32.length; i++) { const s = Math.max(-1, Math.min(1, float32[i])); int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff; } - wsRef.current.send(int16.buffer); + const buffer = int16.buffer; + if (wsRef.current?.readyState === WebSocket.OPEN) { + wsRef.current.send(buffer); + } else { + // WebSocket still connecting — buffer the audio + audioBufferRef.current.push(buffer); + } }; source.connect(processor); @@ -188,5 +204,10 @@ export function useVoiceMode() { stopAudioCapture(); }, [stopAudioCapture]); - return { state, interimText, start, submit, cancel }; + /** Pre-cache auth details so mic click skips IPC round-trips */ + const warmup = useCallback(() => { + refreshAuth().catch(() => {}); + }, [refreshAuth]); + + return { state, interimText, start, submit, cancel, warmup }; }