make voice input faster

This commit is contained in:
Arjun 2026-03-26 22:23:02 +05:30
parent e937fa01ea
commit 4f95ca91a6
2 changed files with 74 additions and 48 deletions

View file

@@ -697,13 +697,18 @@ function App() {
window.ipc.invoke('oauth:getState', null),
]).then(([config, oauthState]) => {
const rowboatConnected = oauthState.config?.rowboat?.connected ?? false
setVoiceAvailable(!!config.deepgram || rowboatConnected)
const hasVoice = !!config.deepgram || rowboatConnected
setVoiceAvailable(hasVoice)
setTtsAvailable(!!config.elevenlabs || rowboatConnected)
// Pre-cache auth details so mic click skips IPC round-trips
if (hasVoice) {
voice.warmup()
}
}).catch(() => {
setVoiceAvailable(false)
setTtsAvailable(false)
})
}, [])
}, [voice])
useEffect(() => {
refreshVoiceAvailability()

View file

@@ -16,6 +16,9 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
});
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
// Cache auth details so we don't need IPC round-trips on every mic click
let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null;
export function useVoiceMode() {
const { refresh: refreshRowboatAccount } = useRowboatAccount();
const [state, setState] = useState<VoiceState>('idle');
@@ -26,32 +29,54 @@ export function useVoiceMode() {
const audioCtxRef = useRef<AudioContext | null>(null);
const transcriptBufferRef = useRef('');
const interimRef = useRef('');
// Buffer audio chunks captured before the WebSocket is ready
const audioBufferRef = useRef<ArrayBuffer[]>([]);
// Connect (or reconnect) the Deepgram WebSocket.
// Refreshes Rowboat account before connect so access token is current.
const connectWs = useCallback(async () => {
if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
let ws: WebSocket;
// Refresh cached auth details (called on warmup, not on mic click)
const refreshAuth = useCallback(async () => {
const account = await refreshRowboatAccount();
if (
account?.signedIn &&
account.accessToken &&
account.config?.websocketApiUrl
) {
const listenUrl = buildDeepgramListenUrl(account.config.websocketApiUrl, DEEPGRAM_PARAMS);
ws = new WebSocket(listenUrl, ['bearer', account.accessToken]);
cachedAuth = { type: 'rowboat', url: account.config.websocketApiUrl, token: account.accessToken };
} else {
// Fall back to local API key (passed as subprotocol)
const config = await window.ipc.invoke('voice:getConfig', null);
if (!config?.deepgram) return;
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]);
if (config?.deepgram) {
cachedAuth = { type: 'local', apiKey: config.deepgram.apiKey };
}
}
}, [refreshRowboatAccount]);
// Create and connect a Deepgram WebSocket using cached auth.
// Starts the connection and returns immediately (does not wait for open).
const connectWs = useCallback(async () => {
if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
// Refresh auth if we don't have it cached yet
if (!cachedAuth) {
await refreshAuth();
}
if (!cachedAuth) return;
let ws: WebSocket;
if (cachedAuth.type === 'rowboat') {
const listenUrl = buildDeepgramListenUrl(cachedAuth.url, DEEPGRAM_PARAMS);
ws = new WebSocket(listenUrl, ['bearer', cachedAuth.token]);
} else {
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', cachedAuth.apiKey]);
}
wsRef.current = ws;
ws.onopen = () => {
console.log('[voice] WebSocket connected');
// Flush any buffered audio captured while we were connecting
const buffered = audioBufferRef.current;
audioBufferRef.current = [];
for (const chunk of buffered) {
ws.send(chunk);
}
};
ws.onmessage = (event) => {
@@ -73,13 +98,15 @@ export function useVoiceMode() {
ws.onerror = () => {
console.error('[voice] WebSocket error');
// Auth may be stale — clear cache so next attempt refreshes
cachedAuth = null;
};
ws.onclose = () => {
console.log('[voice] WebSocket closed');
wsRef.current = null;
};
}, [refreshRowboatAccount]);
}, [refreshAuth]);
// Stop audio capture and close WS
const stopAudioCapture = useCallback(() => {
@@ -100,6 +127,7 @@
wsRef.current.close();
wsRef.current = null;
}
audioBufferRef.current = [];
setInterimText('');
transcriptBufferRef.current = '';
interimRef.current = '';
@@ -112,45 +140,28 @@
transcriptBufferRef.current = '';
interimRef.current = '';
setInterimText('');
audioBufferRef.current = [];
// If WS isn't connected, connect and wait for it
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
setState('connecting');
connectWs();
// Wait for WS to be ready (up to 5 seconds)
const wsOk = await new Promise<boolean>((resolve) => {
const checkInterval = setInterval(() => {
if (wsRef.current?.readyState === WebSocket.OPEN) {
clearInterval(checkInterval);
resolve(true);
}
}, 50);
setTimeout(() => {
clearInterval(checkInterval);
resolve(false);
}, 5000);
});
if (!wsOk) {
setState('idle');
return;
}
}
// Show listening immediately — don't wait for WebSocket
setState('listening');
// Start mic
let stream: MediaStream | null = null;
try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
console.error('Microphone access denied:', err);
// Kick off mic + WebSocket in parallel, don't await WebSocket
const [stream] = await Promise.all([
navigator.mediaDevices.getUserMedia({ audio: true }).catch((err) => {
console.error('Microphone access denied:', err);
return null;
}),
connectWs(),
]);
if (!stream) {
setState('idle');
return;
}
mediaStreamRef.current = stream;
// Start audio capture
// Start audio capture immediately — buffer if WS isn't open yet
const audioCtx = new AudioContext({ sampleRate: 16000 });
audioCtxRef.current = audioCtx;
const source = audioCtx.createMediaStreamSource(stream);
@@ -158,14 +169,19 @@
processorRef.current = processor;
processor.onaudioprocess = (e) => {
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
const float32 = e.inputBuffer.getChannelData(0);
const int16 = new Int16Array(float32.length);
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
}
wsRef.current.send(int16.buffer);
const buffer = int16.buffer;
if (wsRef.current?.readyState === WebSocket.OPEN) {
wsRef.current.send(buffer);
} else {
// WebSocket still connecting — buffer the audio
audioBufferRef.current.push(buffer);
}
};
source.connect(processor);
@@ -188,5 +204,10 @@
stopAudioCapture();
}, [stopAudioCapture]);
return { state, interimText, start, submit, cancel };
/** Pre-cache auth details so mic click skips IPC round-trips */
const warmup = useCallback(() => {
refreshAuth().catch(() => {});
}, [refreshAuth]);
return { state, interimText, start, submit, cancel, warmup };
}