mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-05-06 13:52:44 +02:00
make voice input faster
This commit is contained in:
parent
eb34873c32
commit
473ffa5d4a
2 changed files with 74 additions and 48 deletions
|
|
@@ -703,13 +703,18 @@ function App() {
|
||||||
window.ipc.invoke('oauth:getState', null),
|
window.ipc.invoke('oauth:getState', null),
|
||||||
]).then(([config, oauthState]) => {
|
]).then(([config, oauthState]) => {
|
||||||
const rowboatConnected = oauthState.config?.rowboat?.connected ?? false
|
const rowboatConnected = oauthState.config?.rowboat?.connected ?? false
|
||||||
setVoiceAvailable(!!config.deepgram || rowboatConnected)
|
const hasVoice = !!config.deepgram || rowboatConnected
|
||||||
|
setVoiceAvailable(hasVoice)
|
||||||
setTtsAvailable(!!config.elevenlabs || rowboatConnected)
|
setTtsAvailable(!!config.elevenlabs || rowboatConnected)
|
||||||
|
// Pre-cache auth details so mic click skips IPC round-trips
|
||||||
|
if (hasVoice) {
|
||||||
|
voice.warmup()
|
||||||
|
}
|
||||||
}).catch(() => {
|
}).catch(() => {
|
||||||
setVoiceAvailable(false)
|
setVoiceAvailable(false)
|
||||||
setTtsAvailable(false)
|
setTtsAvailable(false)
|
||||||
})
|
})
|
||||||
}, [])
|
}, [voice])
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
refreshVoiceAvailability()
|
refreshVoiceAvailability()
|
||||||
|
|
|
||||||
|
|
@@ -16,6 +16,9 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
|
||||||
});
|
});
|
||||||
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
|
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
|
||||||
|
|
||||||
|
// Cache auth details so we don't need IPC round-trips on every mic click
|
||||||
|
let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null;
|
||||||
|
|
||||||
export function useVoiceMode() {
|
export function useVoiceMode() {
|
||||||
const { refresh: refreshRowboatAccount } = useRowboatAccount();
|
const { refresh: refreshRowboatAccount } = useRowboatAccount();
|
||||||
const [state, setState] = useState<VoiceState>('idle');
|
const [state, setState] = useState<VoiceState>('idle');
|
||||||
|
|
@@ -26,32 +29,54 @@ export function useVoiceMode() {
|
||||||
const audioCtxRef = useRef<AudioContext | null>(null);
|
const audioCtxRef = useRef<AudioContext | null>(null);
|
||||||
const transcriptBufferRef = useRef('');
|
const transcriptBufferRef = useRef('');
|
||||||
const interimRef = useRef('');
|
const interimRef = useRef('');
|
||||||
|
// Buffer audio chunks captured before the WebSocket is ready
|
||||||
|
const audioBufferRef = useRef<ArrayBuffer[]>([]);
|
||||||
|
|
||||||
// Connect (or reconnect) the Deepgram WebSocket.
|
// Refresh cached auth details (called on warmup; on mic click only when nothing is cached yet)
|
||||||
// Refreshes Rowboat account before connect so access token is current.
|
const refreshAuth = useCallback(async () => {
|
||||||
const connectWs = useCallback(async () => {
|
|
||||||
if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
|
|
||||||
|
|
||||||
let ws: WebSocket;
|
|
||||||
|
|
||||||
const account = await refreshRowboatAccount();
|
const account = await refreshRowboatAccount();
|
||||||
if (
|
if (
|
||||||
account?.signedIn &&
|
account?.signedIn &&
|
||||||
account.accessToken &&
|
account.accessToken &&
|
||||||
account.config?.websocketApiUrl
|
account.config?.websocketApiUrl
|
||||||
) {
|
) {
|
||||||
const listenUrl = buildDeepgramListenUrl(account.config.websocketApiUrl, DEEPGRAM_PARAMS);
|
cachedAuth = { type: 'rowboat', url: account.config.websocketApiUrl, token: account.accessToken };
|
||||||
ws = new WebSocket(listenUrl, ['bearer', account.accessToken]);
|
|
||||||
} else {
|
} else {
|
||||||
// Fall back to local API key (passed as subprotocol)
|
|
||||||
const config = await window.ipc.invoke('voice:getConfig', null);
|
const config = await window.ipc.invoke('voice:getConfig', null);
|
||||||
if (!config?.deepgram) return;
|
if (config?.deepgram) {
|
||||||
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]);
|
cachedAuth = { type: 'local', apiKey: config.deepgram.apiKey };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, [refreshRowboatAccount]);
|
||||||
|
|
||||||
|
// Create and connect a Deepgram WebSocket using cached auth.
|
||||||
|
// Starts the connection and returns immediately (does not wait for open).
|
||||||
|
const connectWs = useCallback(async () => {
|
||||||
|
if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
|
||||||
|
|
||||||
|
// Refresh auth if we don't have it cached yet
|
||||||
|
if (!cachedAuth) {
|
||||||
|
await refreshAuth();
|
||||||
|
}
|
||||||
|
if (!cachedAuth) return;
|
||||||
|
|
||||||
|
let ws: WebSocket;
|
||||||
|
if (cachedAuth.type === 'rowboat') {
|
||||||
|
const listenUrl = buildDeepgramListenUrl(cachedAuth.url, DEEPGRAM_PARAMS);
|
||||||
|
ws = new WebSocket(listenUrl, ['bearer', cachedAuth.token]);
|
||||||
|
} else {
|
||||||
|
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', cachedAuth.apiKey]);
|
||||||
}
|
}
|
||||||
wsRef.current = ws;
|
wsRef.current = ws;
|
||||||
|
|
||||||
ws.onopen = () => {
|
ws.onopen = () => {
|
||||||
console.log('[voice] WebSocket connected');
|
console.log('[voice] WebSocket connected');
|
||||||
|
// Flush any buffered audio captured while we were connecting
|
||||||
|
const buffered = audioBufferRef.current;
|
||||||
|
audioBufferRef.current = [];
|
||||||
|
for (const chunk of buffered) {
|
||||||
|
ws.send(chunk);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
ws.onmessage = (event) => {
|
ws.onmessage = (event) => {
|
||||||
|
|
@@ -73,13 +98,15 @@ export function useVoiceMode() {
|
||||||
|
|
||||||
ws.onerror = () => {
|
ws.onerror = () => {
|
||||||
console.error('[voice] WebSocket error');
|
console.error('[voice] WebSocket error');
|
||||||
|
// Auth may be stale — clear cache so next attempt refreshes
|
||||||
|
cachedAuth = null;
|
||||||
};
|
};
|
||||||
|
|
||||||
ws.onclose = () => {
|
ws.onclose = () => {
|
||||||
console.log('[voice] WebSocket closed');
|
console.log('[voice] WebSocket closed');
|
||||||
wsRef.current = null;
|
wsRef.current = null;
|
||||||
};
|
};
|
||||||
}, [refreshRowboatAccount]);
|
}, [refreshAuth]);
|
||||||
|
|
||||||
// Stop audio capture and close WS
|
// Stop audio capture and close WS
|
||||||
const stopAudioCapture = useCallback(() => {
|
const stopAudioCapture = useCallback(() => {
|
||||||
|
|
@@ -100,6 +127,7 @@ export function useVoiceMode() {
|
||||||
wsRef.current.close();
|
wsRef.current.close();
|
||||||
wsRef.current = null;
|
wsRef.current = null;
|
||||||
}
|
}
|
||||||
|
audioBufferRef.current = [];
|
||||||
setInterimText('');
|
setInterimText('');
|
||||||
transcriptBufferRef.current = '';
|
transcriptBufferRef.current = '';
|
||||||
interimRef.current = '';
|
interimRef.current = '';
|
||||||
|
|
@@ -112,45 +140,28 @@ export function useVoiceMode() {
|
||||||
transcriptBufferRef.current = '';
|
transcriptBufferRef.current = '';
|
||||||
interimRef.current = '';
|
interimRef.current = '';
|
||||||
setInterimText('');
|
setInterimText('');
|
||||||
|
audioBufferRef.current = [];
|
||||||
|
|
||||||
// If WS isn't connected, connect and wait for it
|
// Show listening immediately — don't wait for WebSocket
|
||||||
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
|
|
||||||
setState('connecting');
|
|
||||||
connectWs();
|
|
||||||
// Wait for WS to be ready (up to 5 seconds)
|
|
||||||
const wsOk = await new Promise<boolean>((resolve) => {
|
|
||||||
const checkInterval = setInterval(() => {
|
|
||||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
|
||||||
clearInterval(checkInterval);
|
|
||||||
resolve(true);
|
|
||||||
}
|
|
||||||
}, 50);
|
|
||||||
setTimeout(() => {
|
|
||||||
clearInterval(checkInterval);
|
|
||||||
resolve(false);
|
|
||||||
}, 5000);
|
|
||||||
});
|
|
||||||
if (!wsOk) {
|
|
||||||
setState('idle');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
setState('listening');
|
setState('listening');
|
||||||
|
|
||||||
// Start mic
|
// Kick off mic + WebSocket in parallel, don't await WebSocket
|
||||||
let stream: MediaStream | null = null;
|
const [stream] = await Promise.all([
|
||||||
try {
|
navigator.mediaDevices.getUserMedia({ audio: true }).catch((err) => {
|
||||||
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
console.error('Microphone access denied:', err);
|
||||||
} catch (err) {
|
return null;
|
||||||
console.error('Microphone access denied:', err);
|
}),
|
||||||
|
connectWs(),
|
||||||
|
]);
|
||||||
|
|
||||||
|
if (!stream) {
|
||||||
setState('idle');
|
setState('idle');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
mediaStreamRef.current = stream;
|
mediaStreamRef.current = stream;
|
||||||
|
|
||||||
// Start audio capture
|
// Start audio capture immediately — buffer if WS isn't open yet
|
||||||
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
||||||
audioCtxRef.current = audioCtx;
|
audioCtxRef.current = audioCtx;
|
||||||
const source = audioCtx.createMediaStreamSource(stream);
|
const source = audioCtx.createMediaStreamSource(stream);
|
||||||
|
|
@@ -158,14 +169,19 @@ export function useVoiceMode() {
|
||||||
processorRef.current = processor;
|
processorRef.current = processor;
|
||||||
|
|
||||||
processor.onaudioprocess = (e) => {
|
processor.onaudioprocess = (e) => {
|
||||||
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
|
|
||||||
const float32 = e.inputBuffer.getChannelData(0);
|
const float32 = e.inputBuffer.getChannelData(0);
|
||||||
const int16 = new Int16Array(float32.length);
|
const int16 = new Int16Array(float32.length);
|
||||||
for (let i = 0; i < float32.length; i++) {
|
for (let i = 0; i < float32.length; i++) {
|
||||||
const s = Math.max(-1, Math.min(1, float32[i]));
|
const s = Math.max(-1, Math.min(1, float32[i]));
|
||||||
int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
|
int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
|
||||||
}
|
}
|
||||||
wsRef.current.send(int16.buffer);
|
const buffer = int16.buffer;
|
||||||
|
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||||
|
wsRef.current.send(buffer);
|
||||||
|
} else {
|
||||||
|
// WebSocket still connecting — buffer the audio
|
||||||
|
audioBufferRef.current.push(buffer);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
source.connect(processor);
|
source.connect(processor);
|
||||||
|
|
@@ -188,5 +204,10 @@ export function useVoiceMode() {
|
||||||
stopAudioCapture();
|
stopAudioCapture();
|
||||||
}, [stopAudioCapture]);
|
}, [stopAudioCapture]);
|
||||||
|
|
||||||
return { state, interimText, start, submit, cancel };
|
/** Pre-cache auth details so mic click skips IPC round-trips */
|
||||||
|
const warmup = useCallback(() => {
|
||||||
|
refreshAuth().catch(() => {});
|
||||||
|
}, [refreshAuth]);
|
||||||
|
|
||||||
|
return { state, interimText, start, submit, cancel, warmup };
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue