From 4f95ca91a6b43f0f429da7bfd36033cf05f4b92e Mon Sep 17 00:00:00 2001
From: Arjun <6592213+arkml@users.noreply.github.com>
Date: Thu, 26 Mar 2026 22:23:02 +0530
Subject: [PATCH] make voice input faster

---
 apps/x/apps/renderer/src/App.tsx              |   9 +-
 .../x/apps/renderer/src/hooks/useVoiceMode.ts | 113 +++++++++++-------
 2 files changed, 74 insertions(+), 48 deletions(-)
diff --git a/apps/x/apps/renderer/src/App.tsx b/apps/x/apps/renderer/src/App.tsx
index 591ef21e..415c53ec 100644
--- a/apps/x/apps/renderer/src/App.tsx
+++ b/apps/x/apps/renderer/src/App.tsx
@@ -697,13 +697,18 @@ function App() {
       window.ipc.invoke('oauth:getState', null),
     ]).then(([config, oauthState]) => {
       const rowboatConnected = oauthState.config?.rowboat?.connected ?? false
-      setVoiceAvailable(!!config.deepgram || rowboatConnected)
+      const hasVoice = !!config.deepgram || rowboatConnected
+      setVoiceAvailable(hasVoice)
       setTtsAvailable(!!config.elevenlabs || rowboatConnected)
+      // Pre-cache auth details so a mic click skips IPC round-trips. Depend only on the
+      // memoized `warmup` callback: `voice` (the useVoiceMode result) is a fresh object every render.
+      const warmupVoice = voice.warmup
+      if (hasVoice) warmupVoice()
     }).catch(() => {
       setVoiceAvailable(false)
       setTtsAvailable(false)
     })
-  }, [])
+  }, [voice.warmup])
 
   useEffect(() => {
     refreshVoiceAvailability()
diff --git a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
index 854ac9ea..96144453 100644
--- a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
@@ -16,6 +16,9 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
 });
 const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
 
+// Module-level cache of voice auth details, shared across hook instances, so a mic click needs no IPC round-trips. Cleared on WebSocket error.
+let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null;
+
 export function useVoiceMode() {
     const { refresh: refreshRowboatAccount } = useRowboatAccount();
     const [state, setState] = useState<VoiceState>('idle');
@@ -26,32 +29,54 @@ export function useVoiceMode() {
     const audioCtxRef = useRef<AudioContext | null>(null);
     const transcriptBufferRef = useRef('');
     const interimRef = useRef('');
+    // Buffer audio chunks captured before the WebSocket is ready
+    const audioBufferRef = useRef<ArrayBuffer[]>([]);
 
-    // Connect (or reconnect) the Deepgram WebSocket.
-    // Refreshes Rowboat account before connect so access token is current.
-    const connectWs = useCallback(async () => {
-        if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
-
-        let ws: WebSocket;
-
+    // Refresh cached auth: Rowboat account first, else the local Deepgram key. Run by warmup() and by connectWs() on a cache miss; an existing entry is kept if neither source has credentials.
+    const refreshAuth = useCallback(async () => {
         const account = await refreshRowboatAccount();
         if (
             account?.signedIn &&
             account.accessToken &&
             account.config?.websocketApiUrl
         ) {
-            const listenUrl = buildDeepgramListenUrl(account.config.websocketApiUrl, DEEPGRAM_PARAMS);
-            ws = new WebSocket(listenUrl, ['bearer', account.accessToken]);
+            cachedAuth = { type: 'rowboat', url: account.config.websocketApiUrl, token: account.accessToken };
         } else {
-            // Fall back to local API key (passed as subprotocol)
             const config = await window.ipc.invoke('voice:getConfig', null);
-            if (!config?.deepgram) return;
-            ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]);
+            if (config?.deepgram) {
+                cachedAuth = { type: 'local', apiKey: config.deepgram.apiKey };
+            }
+        }
+    }, [refreshRowboatAccount]);
+
+    // Create and connect a Deepgram WebSocket using cached auth.
+    // Starts the connection and returns immediately (does not wait for open).
+    const connectWs = useCallback(async () => {
+        if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
+
+        // Refresh auth if we don't have it cached yet
+        if (!cachedAuth) {
+            await refreshAuth();
+        }
+        if (!cachedAuth) return;
+
+        let ws: WebSocket;
+        if (cachedAuth.type === 'rowboat') {
+            const listenUrl = buildDeepgramListenUrl(cachedAuth.url, DEEPGRAM_PARAMS);
+            ws = new WebSocket(listenUrl, ['bearer', cachedAuth.token]);
+        } else {
+            ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', cachedAuth.apiKey]);
         }
         wsRef.current = ws;
 
         ws.onopen = () => {
             console.log('[voice] WebSocket connected');
+            // Flush any buffered audio captured while we were connecting
+            const buffered = audioBufferRef.current;
+            audioBufferRef.current = [];
+            for (const chunk of buffered) {
+                ws.send(chunk);
+            }
         };
 
         ws.onmessage = (event) => {
@@ -73,13 +98,15 @@ export function useVoiceMode() {
 
         ws.onerror = () => {
             console.error('[voice] WebSocket error');
+            // Auth may be stale — clear cache so next attempt refreshes
+            cachedAuth = null;
         };
 
         ws.onclose = () => {
             console.log('[voice] WebSocket closed');
             wsRef.current = null;
         };
-    }, [refreshRowboatAccount]);
+    }, [refreshAuth]);
 
     // Stop audio capture and close WS
     const stopAudioCapture = useCallback(() => {
@@ -100,6 +127,7 @@ export function useVoiceMode() {
             wsRef.current.close();
             wsRef.current = null;
         }
+        audioBufferRef.current = [];
         setInterimText('');
         transcriptBufferRef.current = '';
         interimRef.current = '';
@@ -112,45 +140,28 @@ export function useVoiceMode() {
         transcriptBufferRef.current = '';
         interimRef.current = '';
         setInterimText('');
+        audioBufferRef.current = [];
 
-        // If WS isn't connected, connect and wait for it
-        if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
-            setState('connecting');
-            connectWs();
-            // Wait for WS to be ready (up to 5 seconds)
-            const wsOk = await new Promise<boolean>((resolve) => {
-                const checkInterval = setInterval(() => {
-                    if (wsRef.current?.readyState === WebSocket.OPEN) {
-                        clearInterval(checkInterval);
-                        resolve(true);
-                    }
-                }, 50);
-                setTimeout(() => {
-                    clearInterval(checkInterval);
-                    resolve(false);
-                }, 5000);
-            });
-            if (!wsOk) {
-                setState('idle');
-                return;
-            }
-        }
-
+        // Show listening immediately — don't wait for WebSocket
         setState('listening');
 
-        // Start mic
-        let stream: MediaStream | null = null;
-        try {
-            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-        } catch (err) {
-            console.error('Microphone access denied:', err);
+        // Start mic + WebSocket setup in parallel (connectWs resolves before the socket opens).
+        // Neither promise may reject: that would orphan the live mic stream in 'listening'.
+        const [stream] = await Promise.all([
+            navigator.mediaDevices.getUserMedia({ audio: true }).catch((err) => {
+                console.error('Microphone access denied:', err);
+                return null;
+            }),
+            connectWs().catch((err) => console.error('[voice] WebSocket setup failed:', err)),
+        ]);
+        if (!stream) {
             setState('idle');
             return;
         }
 
         mediaStreamRef.current = stream;
 
-        // Start audio capture
+        // Start audio capture immediately — buffer if WS isn't open yet
         const audioCtx = new AudioContext({ sampleRate: 16000 });
         audioCtxRef.current = audioCtx;
         const source = audioCtx.createMediaStreamSource(stream);
@@ -158,14 +169,19 @@ export function useVoiceMode() {
         processorRef.current = processor;
 
         processor.onaudioprocess = (e) => {
-            if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
             const float32 = e.inputBuffer.getChannelData(0);
             const int16 = new Int16Array(float32.length);
             for (let i = 0; i < float32.length; i++) {
                 const s = Math.max(-1, Math.min(1, float32[i]));
                 int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
             }
-            wsRef.current.send(int16.buffer);
+            const buffer = int16.buffer;
+            if (wsRef.current?.readyState === WebSocket.OPEN) {
+                wsRef.current.send(buffer);
+            } else if (wsRef.current?.readyState === WebSocket.CONNECTING) {
+                // Queue only while connecting: with no socket (no auth, or closed) nothing would ever flush this
+                audioBufferRef.current.push(buffer);
+            }
         };
 
         source.connect(processor);
@@ -188,5 +204,10 @@ export function useVoiceMode() {
         stopAudioCapture();
     }, [stopAudioCapture]);
 
-    return { state, interimText, start, submit, cancel };
+    /** Pre-cache auth details so mic click skips IPC round-trips. Best-effort: failures are logged, never thrown. */
+    const warmup = useCallback(() => {
+        refreshAuth().catch((err) => console.warn('[voice] Auth warmup failed:', err));
+    }, [refreshAuth]);
+
+    return { state, interimText, start, submit, cancel, warmup };
 }