Voice TTS: pre-fetch the next chunk while the current one plays, and switch
the synthesis model to eleven_flash_v2_5.

Review notes on this revision of the patch:
- Line structure and generic type arguments were restored from a flattened
  copy of the patch. Indentation and the position of blank context lines are
  reconstructed, so regenerate the patch against the working tree (or apply
  with `git apply --ignore-whitespace`) if a hunk does not match.
- The pre-fetch promise now gets a no-op rejection handler when it is created.
  Without it, a synthesis failure that lands while the current chunk is still
  playing, or after cancel() has dropped the reference, is reported as an
  unhandled promise rejection.
- TSDoc was added to the new helpers; hunk headers were recomputed for the
  extra lines.
- The `index` line is carried over from the original patch; its post-image
  hash does not describe the modified content.

diff --git a/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
index 5773f6de..c0737bf8 100644
--- a/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
@@ -2,11 +2,59 @@ import { useCallback, useRef, useState } from 'react';
 
 export type TTSState = 'idle' | 'synthesizing' | 'speaking';
 
+/** Synthesized speech, ready to hand to an `Audio` element. */
+interface SynthesizedAudio {
+    /** `data:` URL built from the result's MIME type and base64 payload. */
+    dataUrl: string;
+}
+
+/**
+ * Requests speech for `text` over the `voice:synthesize` IPC channel and
+ * wraps the base64 result in a `data:` URL.
+ */
+function synthesize(text: string): Promise<SynthesizedAudio> {
+    return window.ipc.invoke('voice:synthesize', { text }).then(
+        (result: { audioBase64: string; mimeType: string }) => ({
+            dataUrl: `data:${result.mimeType};base64,${result.audioBase64}`,
+        })
+    );
+}
+
+/**
+ * Plays `dataUrl` through a new `Audio` element, publishing the element on
+ * `audioRef` so callers can pause it.
+ *
+ * Resolves when playback ends; rejects if the element reports an error or
+ * `play()` is rejected.
+ */
+function playAudio(dataUrl: string, audioRef: React.MutableRefObject<HTMLAudioElement | null>): Promise<void> {
+    return new Promise<void>((resolve, reject) => {
+        const audio = new Audio(dataUrl);
+        audioRef.current = audio;
+        audio.onended = () => {
+            console.log('[tts] audio ended');
+            resolve();
+        };
+        audio.onerror = (e) => {
+            console.error('[tts] audio error:', e);
+            reject(new Error('Audio playback failed'));
+        };
+        audio.play().then(() => {
+            console.log('[tts] audio playing');
+        }).catch((err) => {
+            console.error('[tts] play() rejected:', err);
+            reject(err);
+        });
+    });
+}
+
 export function useVoiceTTS() {
     const [state, setState] = useState<TTSState>('idle');
     const audioRef = useRef<HTMLAudioElement | null>(null);
     const queueRef = useRef<string[]>([]);
     const processingRef = useRef(false);
+    // Pre-fetched audio ready to play immediately
+    const prefetchedRef = useRef<Promise<SynthesizedAudio> | null>(null);
 
     const processQueue = useCallback(async () => {
         if (processingRef.current) return;
@@ -16,38 +64,43 @@ export function useVoiceTTS() {
             const text = queueRef.current.shift()!;
             if (!text.trim()) continue;
 
-            setState('synthesizing');
-            console.log('[tts] synthesizing:', text.substring(0, 80));
             try {
-                const result = await window.ipc.invoke('voice:synthesize', { text });
-                console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
+                // Use pre-fetched result if available, otherwise synthesize now
+                let audioPromise: Promise<SynthesizedAudio>;
+                if (prefetchedRef.current) {
+                    console.log('[tts] using pre-fetched audio');
+                    audioPromise = prefetchedRef.current;
+                    prefetchedRef.current = null;
+                } else {
+                    setState('synthesizing');
+                    console.log('[tts] synthesizing:', text.substring(0, 80));
+                    audioPromise = synthesize(text);
+                }
+
+                const audio = await audioPromise;
 
                 setState('speaking');
-                await new Promise<void>((resolve, reject) => {
-                    const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
-                    const audio = new Audio(dataUrl);
-                    audioRef.current = audio;
-                    audio.onended = () => {
-                        console.log('[tts] audio ended');
-                        resolve();
-                    };
-                    audio.onerror = (e) => {
-                        console.error('[tts] audio error:', e);
-                        reject(new Error('Audio playback failed'));
-                    };
-                    audio.play().then(() => {
-                        console.log('[tts] audio playing');
-                    }).catch((err) => {
-                        console.error('[tts] play() rejected:', err);
-                        reject(err);
-                    });
-                });
+                // Kick off pre-fetch for next chunk while this one plays
+                const nextText = queueRef.current[0];
+                if (nextText?.trim()) {
+                    console.log('[tts] pre-fetching next:', nextText.substring(0, 80));
+                    const prefetch = synthesize(nextText);
+                    // Attach a no-op handler so a failure that lands during playback,
+                    // or after the reference is dropped, is not reported as unhandled.
+                    // Awaiting `prefetch` later still rejects with the original error.
+                    prefetch.catch(() => undefined);
+                    prefetchedRef.current = prefetch;
+                }
+
+                await playAudio(audio.dataUrl, audioRef);
             } catch (err) {
                 console.error('[tts] error:', err);
+                prefetchedRef.current = null;
             }
         }
 
         audioRef.current = null;
+        prefetchedRef.current = null;
         processingRef.current = false;
         setState('idle');
     }, []);
@@ -60,6 +113,7 @@ export function useVoiceTTS() {
 
     const cancel = useCallback(() => {
         queueRef.current = [];
+        prefetchedRef.current = null;
         if (audioRef.current) {
             audioRef.current.pause();
             audioRef.current = null;
diff --git a/apps/x/packages/core/src/voice/voice.ts b/apps/x/packages/core/src/voice/voice.ts
index b0a6e628..9d46c1fc 100644
--- a/apps/x/packages/core/src/voice/voice.ts
+++ b/apps/x/packages/core/src/voice/voice.ts
@@ -49,7 +49,7 @@ export async function synthesizeSpeech(text: string): Promise<{ audioBase64: str
         },
         body: JSON.stringify({
             text,
-            model_id: 'eleven_multilingual_v2',
+            model_id: 'eleven_flash_v2_5',
             voice_settings: {
                 stability: 0.5,
                 similarity_boost: 0.75,