diff --git a/apps/x/apps/main/src/ipc.ts b/apps/x/apps/main/src/ipc.ts index 6823bd22..055394ce 100644 --- a/apps/x/apps/main/src/ipc.ts +++ b/apps/x/apps/main/src/ipc.ts @@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js'; import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js'; import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js'; import { search } from '@x/core/dist/search/search.js'; -import { versionHistory } from '@x/core'; +import { versionHistory, voice } from '@x/core'; import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js'; type InvokeChannels = ipc.InvokeChannels; @@ -352,7 +352,7 @@ export function setupIpcHandlers() { return runsCore.createRun(args); }, 'runs:createMessage': async (_event, args) => { - return { messageId: await runsCore.createMessage(args.runId, args.message) }; + return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) }; }, 'runs:authorizePermission': async (_event, args) => { await runsCore.authorizePermission(args.runId, args.authorization); @@ -571,5 +571,11 @@ export function setupIpcHandlers() { const schedule = await classifySchedule(args.instruction); return { schedule }; }, + 'voice:getConfig': async () => { + return voice.getVoiceConfig(); + }, + 'voice:synthesize': async (_event, args) => { + return voice.synthesizeSpeech(args.text); + }, }); } diff --git a/apps/x/apps/main/src/main.ts b/apps/x/apps/main/src/main.ts index 08160a23..4dbe6117 100644 --- a/apps/x/apps/main/src/main.ts +++ b/apps/x/apps/main/src/main.ts @@ -1,4 +1,4 @@ -import { app, BrowserWindow, protocol, net, shell } from "electron"; +import { app, BrowserWindow, protocol, net, shell, session } from "electron"; import path from "node:path"; import { setupIpcHandlers, @@ -92,6 +92,15 @@ function createWindow() { }, }); + // Grant microphone permission for voice mode + 
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => { + if (permission === 'media') { + callback(true); + } else { + callback(false); + } + }); + // Show window when content is ready to prevent blank screen win.once("ready-to-show", () => { win.show(); diff --git a/apps/x/apps/renderer/src/App.tsx b/apps/x/apps/renderer/src/App.tsx index 10e68e02..eaa24749 100644 --- a/apps/x/apps/renderer/src/App.tsx +++ b/apps/x/apps/renderer/src/App.tsx @@ -76,6 +76,8 @@ import { import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js' import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js' import { toast } from "sonner" +import { useVoiceMode } from '@/hooks/useVoiceMode' +import { useVoiceTTS } from '@/hooks/useVoiceTTS' type DirEntry = z.infer type RunEventType = z.infer @@ -546,6 +548,87 @@ function App() { const [agentId] = useState('copilot') const [presetMessage, setPresetMessage] = useState(undefined) + // Voice mode state + const [voiceAvailable, setVoiceAvailable] = useState(false) + const [ttsAvailable, setTtsAvailable] = useState(false) + const [ttsEnabled, setTtsEnabled] = useState(false) + const ttsEnabledRef = useRef(false) + const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary') + const ttsModeRef = useRef<'summary' | 'full'>('summary') + const [isRecording, setIsRecording] = useState(false) + const voiceTextBufferRef = useRef('') + const spokenIndexRef = useRef(0) + const isRecordingRef = useRef(false) + + const tts = useVoiceTTS() + const ttsRef = useRef(tts) + ttsRef.current = tts + + const voice = useVoiceMode() + const voiceRef = useRef(voice) + voiceRef.current = voice + + // Check if voice is available on mount + useEffect(() => { + window.ipc.invoke('voice:getConfig', null).then(config => { + setVoiceAvailable(!!config.deepgram) + setTtsAvailable(!!config.elevenlabs) + }).catch(() => { + setVoiceAvailable(false) + setTtsAvailable(false) + }) + }, []) + + const 
handleStartRecording = useCallback(() => { + setIsRecording(true) + isRecordingRef.current = true + voice.start() + }, [voice]) + + const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null) + const pendingVoiceInputRef = useRef(false) + + const handleSubmitRecording = useCallback(() => { + const text = voice.submit() + setIsRecording(false) + isRecordingRef.current = false + if (text) { + pendingVoiceInputRef.current = true + handlePromptSubmitRef.current?.({ text }) + } + }, [voice]) + + const handleToggleTts = useCallback(() => { + setTtsEnabled(prev => { + const next = !prev + ttsEnabledRef.current = next + if (!next) { + ttsRef.current.cancel() + } + return next + }) + }, []) + + const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => { + setTtsMode(mode) + ttsModeRef.current = mode + }, []) + + const handleCancelRecording = useCallback(() => { + voice.cancel() + setIsRecording(false) + isRecordingRef.current = false + }, [voice]) + + // Helper to cancel recording from any navigation handler + const cancelRecordingIfActive = useCallback(() => { + if (isRecordingRef.current) { + voiceRef.current.cancel() + setIsRecording(false) + isRecordingRef.current = false + } + }, []) + // Runs history state type RunListItem = { id: string; title?: string; createdAt: string; agentId: string } const [runs, setRuns] = useState([]) @@ -1496,6 +1579,9 @@ function App() { if (!isActiveRun) return setIsProcessing(true) setModelUsage(null) + // Reset voice buffer for new response + voiceTextBufferRef.current = '' + spokenIndexRef.current = 0 break case 'run-processing-end': @@ -1545,6 +1631,20 @@ function App() { if (llmEvent.type === 'text-delta' && llmEvent.delta) { appendStreamingBuffer(event.runId, llmEvent.delta) setCurrentAssistantMessage(prev => prev + llmEvent.delta) + + // Extract tags and send to TTS when enabled + voiceTextBufferRef.current += llmEvent.delta + const remaining = 
voiceTextBufferRef.current.substring(spokenIndexRef.current) + const voiceRegex = /([\s\S]*?)<\/voice>/g + let voiceMatch: RegExpExecArray | null + while ((voiceMatch = voiceRegex.exec(remaining)) !== null) { + const voiceContent = voiceMatch[1].trim() + console.log('[voice] extracted voice tag:', voiceContent) + if (voiceContent && ttsEnabledRef.current) { + ttsRef.current.speak(voiceContent) + } + spokenIndexRef.current += voiceMatch.index + voiceMatch[0].length + } } else if (llmEvent.type === 'tool-call') { setConversation(prev => [...prev, { id: llmEvent.toolCallId || `tool-${Date.now()}`, @@ -1584,6 +1684,7 @@ function App() { if (msg.role === 'assistant') { setCurrentAssistantMessage(currentMsg => { if (currentMsg) { + const cleanedContent = currentMsg.replace(/<\/?voice>/g, '') setConversation(prev => { const exists = prev.some(m => m.id === event.messageId && 'role' in m && m.role === 'assistant' @@ -1592,7 +1693,7 @@ function App() { return [...prev, { id: event.messageId, role: 'assistant', - content: currentMsg, + content: cleanedContent, timestamp: Date.now(), }] }) @@ -1887,6 +1988,8 @@ function App() { await window.ipc.invoke('runs:createMessage', { runId: currentRunId, message: attachmentPayload, + voiceInput: pendingVoiceInputRef.current || undefined, + voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined, }) } else { // Legacy path: plain string with optional XML-formatted @mentions. @@ -1915,11 +2018,15 @@ function App() { await window.ipc.invoke('runs:createMessage', { runId: currentRunId, message: formattedMessage, + voiceInput: pendingVoiceInputRef.current || undefined, + voiceOutput: ttsEnabledRef.current ? 
ttsModeRef.current : undefined, }) titleSource = formattedMessage } + pendingVoiceInputRef.current = false + if (isNewRun) { const inferredTitle = inferRunTitleFromMessage(titleSource) setRuns((prev) => { @@ -1936,6 +2043,7 @@ function App() { console.error('Failed to send message:', error) } } + handlePromptSubmitRef.current = handlePromptSubmit const handleStop = useCallback(async () => { if (!runId) return @@ -2065,6 +2173,7 @@ function App() { }, []) const openChatInNewTab = useCallback((targetRunId: string) => { + cancelRecordingIfActive() const existingTab = chatTabs.find(t => t.runId === targetRunId) if (existingTab) { // Cancel stale in-flight loads from previously focused tabs. @@ -2080,12 +2189,18 @@ function App() { setChatTabs(prev => [...prev, { id, runId: targetRunId }]) setActiveChatTabId(id) loadRun(targetRunId) - }, [chatTabs, loadRun, restoreChatTabState]) + }, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive]) const switchChatTab = useCallback((tabId: string) => { const tab = chatTabs.find(t => t.id === tabId) if (!tab) return if (tabId === activeChatTabId) return + // Cancel any active recording when switching tabs + if (isRecordingRef.current) { + voiceRef.current.cancel() + setIsRecording(false) + isRecordingRef.current = false + } saveChatScrollForTab(activeChatTabId) // Cancel stale in-flight loads from previously focused tabs. 
loadRunRequestIdRef.current += 1 @@ -2471,13 +2586,14 @@ function App() { const current = currentViewState if (viewStatesEqual(current, nextView)) return + cancelRecordingIfActive() const nextHistory = { back: appendUnique(historyRef.current.back, current), forward: [] as ViewState[], } setHistory(nextHistory) await applyViewState(nextView) - }, [appendUnique, applyViewState, currentViewState, setHistory]) + }, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory]) const navigateBack = useCallback(async () => { const { back, forward } = historyRef.current @@ -3412,6 +3528,7 @@ function App() { tasksActions={{ onNewChat: handleNewChatTab, onSelectRun: (runIdToLoad) => { + cancelRecordingIfActive() if (selectedPath || isGraphOpen) { setIsChatSidebarOpen(true) } @@ -3814,7 +3931,7 @@ function App() { {tabState.currentAssistantMessage && ( - {tabState.currentAssistantMessage} + {tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')} )} @@ -3865,6 +3982,18 @@ function App() { runId={tabState.runId} initialDraft={chatDraftsRef.current.get(tab.id)} onDraftChange={(text) => setChatDraftForTab(tab.id, text)} + isRecording={isActive && isRecording} + recordingText={isActive ? voice.interimText : undefined} + recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined} + onStartRecording={isActive ? handleStartRecording : undefined} + onSubmitRecording={isActive ? handleSubmitRecording : undefined} + onCancelRecording={isActive ? handleCancelRecording : undefined} + voiceAvailable={isActive && voiceAvailable} + ttsAvailable={isActive && ttsAvailable} + ttsEnabled={ttsEnabled} + ttsMode={ttsMode} + onToggleTts={isActive ? handleToggleTts : undefined} + onTtsModeChange={isActive ? 
handleTtsModeChange : undefined} /> ) @@ -3914,6 +4043,18 @@ function App() { onToolOpenChangeForTab={setToolOpenForTab} onOpenKnowledgeFile={(path) => { navigateToFile(path) }} onActivate={() => setActiveShortcutPane('right')} + isRecording={isRecording} + recordingText={voice.interimText} + recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'} + onStartRecording={handleStartRecording} + onSubmitRecording={handleSubmitRecording} + onCancelRecording={handleCancelRecording} + voiceAvailable={voiceAvailable} + ttsAvailable={ttsAvailable} + ttsEnabled={ttsEnabled} + ttsMode={ttsMode} + onToggleTts={handleToggleTts} + onTtsModeChange={handleTtsModeChange} /> )} {/* Rendered last so its no-drag region paints over the sidebar drag region */} diff --git a/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx b/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx index 42ea45bb..e7db29d8 100644 --- a/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx +++ b/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx @@ -1,4 +1,5 @@ import { useCallback, useEffect, useRef, useState } from 'react' +import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip' import { ArrowUp, AudioLines, @@ -9,7 +10,9 @@ import { FileSpreadsheet, FileText, FileVideo, + Headphones, LoaderIcon, + Mic, Plus, Square, X, @@ -102,6 +105,18 @@ interface ChatInputInnerProps { runId?: string | null initialDraft?: string onDraftChange?: (text: string) => void + isRecording?: boolean + recordingText?: string + recordingState?: 'connecting' | 'listening' + onStartRecording?: () => void + onSubmitRecording?: () => void + onCancelRecording?: () => void + voiceAvailable?: boolean + ttsAvailable?: boolean + ttsEnabled?: boolean + ttsMode?: 'summary' | 'full' + onToggleTts?: () => void + onTtsModeChange?: (mode: 'summary' | 'full') => void } function ChatInputInner({ @@ -115,6 +130,18 @@ function ChatInputInner({ runId, 
initialDraft, onDraftChange, + isRecording, + recordingText, + recordingState, + onStartRecording, + onSubmitRecording, + onCancelRecording, + voiceAvailable, + ttsAvailable, + ttsEnabled, + ttsMode, + onToggleTts, + onTtsModeChange, }: ChatInputInnerProps) { const controller = usePromptInputController() const message = controller.textInput.value @@ -367,6 +394,40 @@ function ChatInputInner({ e.target.value = '' }} /> + {isRecording ? ( + /* ── Recording bar ── */ +
+ +
+ + + {recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'} + +
+ +
+ ) : ( + /* ── Normal input ── */ + <>
)} + {onToggleTts && ttsAvailable && ( +
+ + + + + + {ttsEnabled ? 'Voice output on' : 'Voice output off'} + + + {ttsEnabled && onTtsModeChange && ( + + + + + + onTtsModeChange(v as 'summary' | 'full')}> + Speak summary + Speak full response + + + + )} +
+ )} + {voiceAvailable && onStartRecording && ( + + )} {isProcessing ? (
+ + )} + + ) +} + +/** Animated waveform bars for the recording indicator */ +function VoiceWaveform() { + return ( +
+ {[0, 1, 2, 3, 4].map((i) => ( + + ))} +
) } @@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps { runId?: string | null initialDraft?: string onDraftChange?: (text: string) => void + isRecording?: boolean + recordingText?: string + recordingState?: 'connecting' | 'listening' + onStartRecording?: () => void + onSubmitRecording?: () => void + onCancelRecording?: () => void + voiceAvailable?: boolean + ttsAvailable?: boolean + ttsEnabled?: boolean + ttsMode?: 'summary' | 'full' + onToggleTts?: () => void + onTtsModeChange?: (mode: 'summary' | 'full') => void } export function ChatInputWithMentions({ @@ -482,6 +637,18 @@ export function ChatInputWithMentions({ runId, initialDraft, onDraftChange, + isRecording, + recordingText, + recordingState, + onStartRecording, + onSubmitRecording, + onCancelRecording, + voiceAvailable, + ttsAvailable, + ttsEnabled, + ttsMode, + onToggleTts, + onTtsModeChange, }: ChatInputWithMentionsProps) { return ( @@ -496,6 +663,18 @@ export function ChatInputWithMentions({ runId={runId} initialDraft={initialDraft} onDraftChange={onDraftChange} + isRecording={isRecording} + recordingText={recordingText} + recordingState={recordingState} + onStartRecording={onStartRecording} + onSubmitRecording={onSubmitRecording} + onCancelRecording={onCancelRecording} + voiceAvailable={voiceAvailable} + ttsAvailable={ttsAvailable} + ttsEnabled={ttsEnabled} + ttsMode={ttsMode} + onToggleTts={onToggleTts} + onTtsModeChange={onTtsModeChange} /> ) diff --git a/apps/x/apps/renderer/src/components/chat-sidebar.tsx b/apps/x/apps/renderer/src/components/chat-sidebar.tsx index f020cdae..ac7f23be 100644 --- a/apps/x/apps/renderer/src/components/chat-sidebar.tsx +++ b/apps/x/apps/renderer/src/components/chat-sidebar.tsx @@ -108,6 +108,19 @@ interface ChatSidebarProps { onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void onOpenKnowledgeFile?: (path: string) => void onActivate?: () => void + // Voice / TTS props + isRecording?: boolean + recordingText?: string + 
recordingState?: 'connecting' | 'listening' + onStartRecording?: () => void + onSubmitRecording?: () => void + onCancelRecording?: () => void + voiceAvailable?: boolean + ttsAvailable?: boolean + ttsEnabled?: boolean + ttsMode?: 'summary' | 'full' + onToggleTts?: () => void + onTtsModeChange?: (mode: 'summary' | 'full') => void } export function ChatSidebar({ @@ -146,6 +159,18 @@ export function ChatSidebar({ onToolOpenChangeForTab, onOpenKnowledgeFile, onActivate, + isRecording, + recordingText, + recordingState, + onStartRecording, + onSubmitRecording, + onCancelRecording, + voiceAvailable, + ttsAvailable, + ttsEnabled, + ttsMode, + onToggleTts, + onTtsModeChange, }: ChatSidebarProps) { const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth)) const [isResizing, setIsResizing] = useState(false) @@ -542,6 +567,18 @@ export function ChatSidebar({ runId={tabState.runId} initialDraft={getInitialDraft?.(tab.id)} onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined} + isRecording={isActive && isRecording} + recordingText={isActive ? recordingText : undefined} + recordingState={isActive ? recordingState : undefined} + onStartRecording={isActive ? onStartRecording : undefined} + onSubmitRecording={isActive ? onSubmitRecording : undefined} + onCancelRecording={isActive ? onCancelRecording : undefined} + voiceAvailable={isActive && voiceAvailable} + ttsAvailable={isActive && ttsAvailable} + ttsEnabled={ttsEnabled} + ttsMode={ttsMode} + onToggleTts={isActive ? onToggleTts : undefined} + onTtsModeChange={isActive ? 
onTtsModeChange : undefined} /> ) diff --git a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts new file mode 100644 index 00000000..f30d7ff4 --- /dev/null +++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts @@ -0,0 +1,218 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; + +export type VoiceState = 'idle' | 'connecting' | 'listening'; + +// Cache the API key so we skip the IPC call after first use +let cachedApiKey: string | null = null; +let apiKeyFetched = false; + +export function useVoiceMode() { + const [state, setState] = useState('idle'); + const [interimText, setInterimText] = useState(''); + const wsRef = useRef(null); + const mediaStreamRef = useRef(null); + const processorRef = useRef(null); + const audioCtxRef = useRef(null); + const transcriptBufferRef = useRef(''); + const interimRef = useRef(''); + const reconnectTimerRef = useRef | null>(null); + const mountedRef = useRef(true); + + // Connect (or reconnect) the Deepgram WebSocket. + // The WS stays open while the hook is mounted; only audio capture starts/stops per recording. + const connectWs = useCallback(() => { + if (!cachedApiKey) return; + if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return; + + const ws = new WebSocket( + `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`, + ['token', cachedApiKey] + ); + wsRef.current = ws; + + ws.onopen = () => { + console.log('[voice] WebSocket connected'); + }; + + ws.onmessage = (event) => { + const data = JSON.parse(event.data); + if (!data.channel?.alternatives?.[0]) return; + + const transcript = data.channel.alternatives[0].transcript; + if (!transcript) return; + + if (data.is_final) { + transcriptBufferRef.current += (transcriptBufferRef.current ? 
' ' : '') + transcript; + interimRef.current = ''; + setInterimText(transcriptBufferRef.current); + } else { + interimRef.current = transcript; + setInterimText(transcriptBufferRef.current + (transcriptBufferRef.current ? ' ' : '') + transcript); + } + }; + + ws.onerror = () => { + console.error('[voice] WebSocket error'); + }; + + ws.onclose = () => { + console.log('[voice] WebSocket closed'); + wsRef.current = null; + // Auto-reconnect after 3 seconds if still mounted + if (mountedRef.current && cachedApiKey) { + reconnectTimerRef.current = setTimeout(() => { + if (mountedRef.current) connectWs(); + }, 3000); + } + }; + }, []); + + // Fetch API key on mount and establish persistent WebSocket + useEffect(() => { + mountedRef.current = true; + + const init = async () => { + if (!apiKeyFetched) { + apiKeyFetched = true; + try { + const config = await window.ipc.invoke('voice:getConfig', null); + cachedApiKey = config.deepgram?.apiKey ?? null; + } catch { /* ignore */ } + } + if (cachedApiKey && mountedRef.current) { + connectWs(); + } + }; + void init(); + + return () => { + mountedRef.current = false; + if (reconnectTimerRef.current) { + clearTimeout(reconnectTimerRef.current); + reconnectTimerRef.current = null; + } + // Close WS on unmount, suppress reconnect by nulling onclose + if (wsRef.current) { + wsRef.current.onclose = null; + wsRef.current.close(); + wsRef.current = null; + } + }; + }, [connectWs]); + + // Stop only audio capture (mic + processor), leaving WS open + const stopAudioCapture = useCallback(() => { + if (processorRef.current) { + processorRef.current.disconnect(); + processorRef.current = null; + } + if (audioCtxRef.current) { + audioCtxRef.current.close(); + audioCtxRef.current = null; + } + if (mediaStreamRef.current) { + mediaStreamRef.current.getTracks().forEach(t => t.stop()); + mediaStreamRef.current = null; + } + setInterimText(''); + transcriptBufferRef.current = ''; + interimRef.current = ''; + setState('idle'); + }, []); + + const 
start = useCallback(async () => { + if (state !== 'idle') return; + + // Ensure we have an API key + if (!cachedApiKey) { + try { + const config = await window.ipc.invoke('voice:getConfig', null); + cachedApiKey = config.deepgram?.apiKey ?? null; + } catch { /* ignore */ } + } + if (!cachedApiKey) { + console.error('Deepgram not configured'); + return; + } + + transcriptBufferRef.current = ''; + interimRef.current = ''; + setInterimText(''); + + // If WS isn't connected, connect and wait for it + if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) { + setState('connecting'); + connectWs(); + // Wait for WS to be ready (up to 5 seconds) + const wsOk = await new Promise((resolve) => { + const checkInterval = setInterval(() => { + if (wsRef.current?.readyState === WebSocket.OPEN) { + clearInterval(checkInterval); + resolve(true); + } + }, 50); + setTimeout(() => { + clearInterval(checkInterval); + resolve(false); + }, 5000); + }); + if (!wsOk) { + setState('idle'); + return; + } + } + + setState('listening'); + + // Start mic + let stream: MediaStream | null = null; + try { + stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + } catch (err) { + console.error('Microphone access denied:', err); + setState('idle'); + return; + } + + mediaStreamRef.current = stream; + + // Start audio capture + const audioCtx = new AudioContext({ sampleRate: 16000 }); + audioCtxRef.current = audioCtx; + const source = audioCtx.createMediaStreamSource(stream); + const processor = audioCtx.createScriptProcessor(4096, 1, 1); + processorRef.current = processor; + + processor.onaudioprocess = (e) => { + if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return; + const float32 = e.inputBuffer.getChannelData(0); + const int16 = new Int16Array(float32.length); + for (let i = 0; i < float32.length; i++) { + const s = Math.max(-1, Math.min(1, float32[i])); + int16[i] = s < 0 ? 
s * 0x8000 : s * 0x7fff; + } + wsRef.current.send(int16.buffer); + }; + + source.connect(processor); + processor.connect(audioCtx.destination); + }, [state, connectWs]); + + /** Stop recording and return the full transcript (finalized + any current interim) */ + const submit = useCallback((): string => { + let text = transcriptBufferRef.current; + if (interimRef.current) { + text += (text ? ' ' : '') + interimRef.current; + } + text = text.trim(); + stopAudioCapture(); + return text; + }, [stopAudioCapture]); + + /** Cancel recording without returning transcript */ + const cancel = useCallback(() => { + stopAudioCapture(); + }, [stopAudioCapture]); + + return { state, interimText, start, submit, cancel }; +} diff --git a/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts new file mode 100644 index 00000000..5773f6de --- /dev/null +++ b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts @@ -0,0 +1,72 @@ +import { useCallback, useRef, useState } from 'react'; + +export type TTSState = 'idle' | 'synthesizing' | 'speaking'; + +export function useVoiceTTS() { + const [state, setState] = useState('idle'); + const audioRef = useRef(null); + const queueRef = useRef([]); + const processingRef = useRef(false); + + const processQueue = useCallback(async () => { + if (processingRef.current) return; + processingRef.current = true; + + while (queueRef.current.length > 0) { + const text = queueRef.current.shift()!; + if (!text.trim()) continue; + + setState('synthesizing'); + console.log('[tts] synthesizing:', text.substring(0, 80)); + try { + const result = await window.ipc.invoke('voice:synthesize', { text }); + console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length); + setState('speaking'); + + await new Promise((resolve, reject) => { + const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`; + const audio = new Audio(dataUrl); + audioRef.current = audio; + audio.onended = () 
=> { + console.log('[tts] audio ended'); + resolve(); + }; + audio.onerror = (e) => { + console.error('[tts] audio error:', e); + reject(new Error('Audio playback failed')); + }; + audio.play().then(() => { + console.log('[tts] audio playing'); + }).catch((err) => { + console.error('[tts] play() rejected:', err); + reject(err); + }); + }); + } catch (err) { + console.error('[tts] error:', err); + } + } + + audioRef.current = null; + processingRef.current = false; + setState('idle'); + }, []); + + const speak = useCallback((text: string) => { + console.log('[tts] speak() called:', text.substring(0, 80)); + queueRef.current.push(text); + processQueue(); + }, [processQueue]); + + const cancel = useCallback(() => { + queueRef.current = []; + if (audioRef.current) { + audioRef.current.pause(); + audioRef.current = null; + } + processingRef.current = false; + setState('idle'); + }, []); + + return { state, speak, cancel }; +} diff --git a/apps/x/packages/core/src/agents/runtime.ts b/apps/x/packages/core/src/agents/runtime.ts index cd85e9be..0c15f610 100644 --- a/apps/x/packages/core/src/agents/runtime.ts +++ b/apps/x/packages/core/src/agents/runtime.ts @@ -894,11 +894,19 @@ export async function* streamAgent({ } // get any queued user messages + let voiceInput = false; + let voiceOutput: 'summary' | 'full' | null = null; while (true) { const msg = await messageQueue.dequeue(runId); if (!msg) { break; } + if (msg.voiceInput) { + voiceInput = true; + } + if (msg.voiceOutput) { + voiceOutput = msg.voiceOutput; + } loopLogger.log('dequeued user message', msg.messageId); yield* processEvent({ runId, @@ -938,7 +946,18 @@ export async function* streamAgent({ minute: '2-digit', timeZoneName: 'short' }); - const instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`; + let instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`; + if (voiceInput) { + loopLogger.log('voice input enabled, injecting 
voice input prompt'); + instructionsWithDateTime += `\n\n# Voice Input\nThe user's message was transcribed from speech. Be aware that:\n- There may be transcription errors. Silently correct obvious ones (e.g. homophones, misheard words). If an error is genuinely ambiguous, briefly mention your interpretation (e.g. "I'm assuming you meant X").\n- Spoken messages are often long-winded. The user may ramble, repeat themselves, or correct something they said earlier in the same message. Focus on their final intent, not every word verbatim.`; + } + if (voiceOutput === 'summary') { + loopLogger.log('voice output enabled (summary mode), injecting voice output prompt'); + instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with tags.\n\nRules:\n1. ALWAYS start your response with one or more tags. Never skip them.\n2. Place ALL tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate tag so it can be spoken incrementally. Do NOT wrap everything in a single block.\n4. 
Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\nYour meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.\nI've pulled out the key details and action items below — the demo prep notes are at the end.\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more tags.)\n\nAny text outside tags is shown visually but not spoken.`; + } else if (voiceOutput === 'full') { + loopLogger.log('voice output enabled (full mode), injecting voice output prompt'); + instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a tag. 
Do not leave any content outside tags.\n\nExample:\nYour meeting with Sarah covered three main things.\nFirst, you discussed the Q2 roadmap timeline and agreed to push the launch to April.\nSecond, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.\nAnd lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.`; + } let streamError: string | null = null; for await (const event of streamLlm( model, diff --git a/apps/x/packages/core/src/application/assistant/instructions.ts b/apps/x/packages/core/src/application/assistant/instructions.ts index 5dd0383b..978e6c40 100644 --- a/apps/x/packages/core/src/application/assistant/instructions.ts +++ b/apps/x/packages/core/src/application/assistant/instructions.ts @@ -33,6 +33,8 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects, **Document Collaboration:** When users ask you to work on a document, collaborate on writing, create a new document, edit/refine existing notes, or say things like "let's work on [X]", "help me write [X]", "create a doc for [X]", or "let's draft [X]", you MUST load the \`doc-collab\` skill first. This is required for any document creation or editing task. The skill provides structured guidance for creating, editing, and refining documents in the knowledge base. +**App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view. + **Slack:** When users ask about Slack messages, want to send messages to teammates, check channel conversations, or find someone on Slack, load the \`slack\` skill. You can send messages, view channel history, search conversations, and find users. Always show message drafts to the user before sending. 
## Memory That Compounds @@ -184,6 +186,7 @@ ${runtimeContextPrompt} - \`loadSkill\` - Skill loading - \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them. - \`web-search\` and \`research-search\` - Web and research search tools (available when configured). **You MUST load the \`web-search\` skill before using either of these tools.** It tells you which tool to pick and how many searches to do. +- \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.** **Prefer these tools whenever possible** — they work instantly with zero friction. For file operations inside \`~/.rowboat/\`, always use these instead of \`executeCommand\`. diff --git a/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts b/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts index 99f143f4..9dbbce84 100644 --- a/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts +++ b/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts @@ -44,6 +44,7 @@ Change filters, columns, sort order, or search in the bases (table) view. - If unsure what categories/values are available, call ` + "`get-base-state`" + ` first. - For "show me X", prefer ` + "`filters.set`" + ` to start fresh rather than ` + "`filters.add`" + `. - Categories come from frontmatter keys (e.g., relationship, status, topic, type). +- **CRITICAL: Do NOT pass ` + "`columns`" + ` unless the user explicitly asks to show/hide specific columns.** Omit the ` + "`columns`" + ` parameter entirely when only filtering, sorting, or searching. 
Passing ` + "`columns`" + ` will override the user's current column layout and can make the view appear empty. ### get-base-state Retrieve information about what's in the knowledge base — available filter categories, values, and note count. @@ -75,6 +76,7 @@ Save the current view configuration as a named base. - The ` + "`update-base-view`" + ` action will automatically navigate to the bases view if the user isn't already there. - ` + "`open-note`" + ` validates that the file exists before navigating. - Filter categories and values come from frontmatter in knowledge files. +- **Never send ` + "`columns`" + ` or ` + "`sort`" + ` with ` + "`update-base-view`" + ` unless the user specifically asks to change them.** Only pass the parameters you intend to change — omitted parameters are left untouched. `; export default skill; diff --git a/apps/x/packages/core/src/application/lib/builtin-tools.ts b/apps/x/packages/core/src/application/lib/builtin-tools.ts index e50c4acd..32454b31 100644 --- a/apps/x/packages/core/src/application/lib/builtin-tools.ts +++ b/apps/x/packages/core/src/application/lib/builtin-tools.ts @@ -884,6 +884,145 @@ export const BuiltinTools: z.infer = { }, }, + // ============================================================================ + // App Navigation + // ============================================================================ + + 'app-navigation': { + description: 'Control the app UI - navigate to notes, switch views, filter/search the knowledge base, and manage saved views.', + inputSchema: z.object({ + action: z.enum(["open-note", "open-view", "update-base-view", "get-base-state", "create-base"]).describe("The navigation action to perform"), + // open-note + path: z.string().optional().describe("Knowledge file path for open-note, e.g. 
knowledge/People/John.md"), + // open-view + view: z.enum(["bases", "graph"]).optional().describe("Which view to open (for open-view action)"), + // update-base-view + filters: z.object({ + set: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Replace all filters with these"), + add: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Add these filters"), + remove: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Remove these filters"), + clear: z.boolean().optional().describe("Clear all filters"), + }).optional().describe("Filter modifications (for update-base-view)"), + columns: z.object({ + set: z.array(z.string()).optional().describe("Replace visible columns with these"), + add: z.array(z.string()).optional().describe("Add these columns"), + remove: z.array(z.string()).optional().describe("Remove these columns"), + }).optional().describe("Column modifications (for update-base-view)"), + sort: z.object({ + field: z.string(), + dir: z.enum(["asc", "desc"]), + }).optional().describe("Sort configuration (for update-base-view)"), + search: z.string().optional().describe("Search query to filter notes (for update-base-view)"), + // get-base-state + base_name: z.string().optional().describe("Name of a saved base to inspect (for get-base-state). 
Omit for the current/default view."),
+      // create-base
+      name: z.string().optional().describe("Name for the saved base view (for create-base)"),
+    }),
+    execute: async (input: {
+      action: string;
+      [key: string]: unknown;
+    }) => {
+      switch (input.action) {
+        case 'open-note': {
+          const filePath = input.path as string;
+          try {
+            const result = await workspace.exists(filePath);
+            if (!result.exists) {
+              return { success: false, error: `File not found: ${filePath}` };
+            }
+            return { success: true, action: 'open-note', path: filePath };
+          } catch {
+            return { success: false, error: `Could not access file: ${filePath}` };
+          }
+        }
+
+        case 'open-view': {
+          const view = input.view as string;
+          return { success: true, action: 'open-view', view };
+        }
+
+        case 'update-base-view': {
+          const updates: Record<string, unknown> = {};
+          if (input.filters) updates.filters = input.filters;
+          if (input.columns) updates.columns = input.columns;
+          if (input.sort) updates.sort = input.sort;
+          if (input.search !== undefined) updates.search = input.search;
+          return { success: true, action: 'update-base-view', updates };
+        }
+
+        case 'get-base-state': {
+          // Scan knowledge/ files and extract frontmatter properties
+          try {
+            const { parseFrontmatter } = await import("@x/shared/dist/frontmatter.js");
+            const entries = await workspace.readdir("knowledge", { recursive: true, allowedExtensions: [".md"] });
+            const files = entries.filter(e => e.kind === 'file');
+            const properties = new Map<string, Set<string>>();
+            let noteCount = 0;
+
+            for (const file of files) {
+              try {
+                const { data } = await workspace.readFile(file.path);
+                const { fields } = parseFrontmatter(data);
+                noteCount++;
+                for (const [key, value] of Object.entries(fields)) {
+                  if (!value) continue;
+                  let set = properties.get(key);
+                  if (!set) { set = new Set(); properties.set(key, set); }
+                  const values = Array.isArray(value) ?
value : [value];
+                  for (const v of values) {
+                    // Coerce before trimming: frontmatter values may be numbers or
+                    // booleans; calling .trim() on a non-string throws, and the
+                    // per-file catch below would then silently drop the whole file.
+                    const trimmed = String(v).trim();
+                    if (trimmed) set.add(trimmed);
+                  }
+                }
+              } catch {
+                // skip unreadable files
+              }
+            }
+
+            const availableProperties: Record<string, string[]> = {};
+            for (const [key, values] of properties) {
+              availableProperties[key] = [...values].sort();
+            }
+
+            return {
+              success: true,
+              action: 'get-base-state',
+              noteCount,
+              availableProperties,
+            };
+          } catch (error) {
+            return {
+              success: false,
+              error: error instanceof Error ? error.message : 'Failed to read knowledge base',
+            };
+          }
+        }
+
+        case 'create-base': {
+          // `name` is optional in the input schema; default to '' so a missing
+          // name hits the clean 'Invalid base name' error below instead of
+          // throwing TypeError on undefined.replace(...).
+          const name = (input.name as string | undefined) ?? '';
+          const safeName = name.replace(/[^a-zA-Z0-9_\- ]/g, '').trim();
+          if (!safeName) {
+            return { success: false, error: 'Invalid base name' };
+          }
+          const basePath = `bases/${safeName}.base`;
+          try {
+            const config = { name: safeName, filters: [], columns: [] };
+            await workspace.writeFile(basePath, JSON.stringify(config, null, 2), { mkdirp: true });
+            return { success: true, action: 'create-base', name: safeName, path: basePath };
+          } catch (error) {
+            return {
+              success: false,
+              error: error instanceof Error ?
error.message : 'Failed to create base', + }; + } + } + + default: + return { success: false, error: `Unknown action: ${input.action}` }; + } + }, + }, + // ============================================================================ // Web Search (Brave Search API) // ============================================================================ diff --git a/apps/x/packages/core/src/application/lib/message-queue.ts b/apps/x/packages/core/src/application/lib/message-queue.ts index 2b864840..9dc0980d 100644 --- a/apps/x/packages/core/src/application/lib/message-queue.ts +++ b/apps/x/packages/core/src/application/lib/message-queue.ts @@ -3,14 +3,17 @@ import { UserMessageContent } from "@x/shared/dist/message.js"; import z from "zod"; export type UserMessageContentType = z.infer; +export type VoiceOutputMode = 'summary' | 'full'; type EnqueuedMessage = { messageId: string; message: UserMessageContentType; + voiceInput?: boolean; + voiceOutput?: VoiceOutputMode; }; export interface IMessageQueue { - enqueue(runId: string, message: UserMessageContentType): Promise; + enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise; dequeue(runId: string): Promise; } @@ -26,7 +29,7 @@ export class InMemoryMessageQueue implements IMessageQueue { this.idGenerator = idGenerator; } - async enqueue(runId: string, message: UserMessageContentType): Promise { + async enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise { if (!this.store[runId]) { this.store[runId] = []; } @@ -34,6 +37,8 @@ export class InMemoryMessageQueue implements IMessageQueue { this.store[runId].push({ messageId: id, message, + voiceInput, + voiceOutput, }); return id; } @@ -44,4 +49,4 @@ export class InMemoryMessageQueue implements IMessageQueue { } return this.store[runId].shift() ?? 
null; } -} \ No newline at end of file +} diff --git a/apps/x/packages/core/src/index.ts b/apps/x/packages/core/src/index.ts index 0eab08e3..894279b6 100644 --- a/apps/x/packages/core/src/index.ts +++ b/apps/x/packages/core/src/index.ts @@ -9,3 +9,6 @@ export { initConfigs } from './config/initConfigs.js'; // Knowledge version history export * as versionHistory from './knowledge/version_history.js'; + +// Voice mode (config + TTS) +export * as voice from './voice/voice.js'; diff --git a/apps/x/packages/core/src/models/models.ts b/apps/x/packages/core/src/models/models.ts index 7754bd8e..38b6801f 100644 --- a/apps/x/packages/core/src/models/models.ts +++ b/apps/x/packages/core/src/models/models.ts @@ -64,7 +64,7 @@ export function createProvider(config: z.infer): ProviderV2 { apiKey, baseURL, headers, - }); + }) as unknown as ProviderV2; default: throw new Error(`Unsupported provider flavor: ${config.flavor}`); } diff --git a/apps/x/packages/core/src/runs/runs.ts b/apps/x/packages/core/src/runs/runs.ts index 0f123497..7c2f3910 100644 --- a/apps/x/packages/core/src/runs/runs.ts +++ b/apps/x/packages/core/src/runs/runs.ts @@ -1,6 +1,6 @@ import z from "zod"; import container from "../di/container.js"; -import { IMessageQueue, UserMessageContentType } from "../application/lib/message-queue.js"; +import { IMessageQueue, UserMessageContentType, VoiceOutputMode } from "../application/lib/message-queue.js"; import { AskHumanResponseEvent, ToolPermissionRequestEvent, ToolPermissionResponseEvent, CreateRunOptions, Run, ListRunsResponse, ToolPermissionAuthorizePayload, AskHumanResponsePayload } from "@x/shared/dist/runs.js"; import { IRunsRepo } from "./repo.js"; import { IAgentRuntime } from "../agents/runtime.js"; @@ -19,9 +19,9 @@ export async function createRun(opts: z.infer): Promise return run; } -export async function createMessage(runId: string, message: UserMessageContentType): Promise { +export async function createMessage(runId: string, message: 
UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
   const queue = container.resolve<IMessageQueue>('messageQueue');
-  const id = await queue.enqueue(runId, message);
+  const id = await queue.enqueue(runId, message, voiceInput, voiceOutput);
   const runtime = container.resolve<IAgentRuntime>('agentRuntime');
   runtime.trigger(runId);
   return id;
 }
diff --git a/apps/x/packages/core/src/voice/voice.ts b/apps/x/packages/core/src/voice/voice.ts
new file mode 100644
index 00000000..b0a6e628
--- /dev/null
+++ b/apps/x/packages/core/src/voice/voice.ts
@@ -0,0 +1,70 @@
+import * as fs from 'fs/promises';
+import * as path from 'path';
+
+const homedir = process.env.HOME || process.env.USERPROFILE || '';
+
+export interface VoiceConfig {
+  deepgram: { apiKey: string } | null;
+  elevenlabs: { apiKey: string; voiceId?: string } | null;
+}
+
+async function readJsonConfig(filename: string): Promise<Record<string, unknown> | null> {
+  try {
+    const configPath = path.join(homedir, '.rowboat', 'config', filename);
+    const raw = await fs.readFile(configPath, 'utf8');
+    return JSON.parse(raw);
+  } catch {
+    return null;
+  }
+}
+
+export async function getVoiceConfig(): Promise<VoiceConfig> {
+  const dgConfig = await readJsonConfig('deepgram.json');
+  const elConfig = await readJsonConfig('elevenlabs.json');
+
+  return {
+    deepgram: dgConfig?.apiKey ? { apiKey: dgConfig.apiKey as string } : null,
+    elevenlabs: elConfig?.apiKey
+      ? { apiKey: elConfig.apiKey as string, voiceId: elConfig.voiceId as string | undefined }
+      : null,
+  };
+}
+
+export async function synthesizeSpeech(text: string): Promise<{ audioBase64: string; mimeType: string }> {
+  const config = await getVoiceConfig();
+  if (!config.elevenlabs) {
+    throw new Error('ElevenLabs not configured. 
Create ~/.rowboat/config/elevenlabs.json with { "apiKey": "" }'); + } + + const voiceId = config.elevenlabs.voiceId || 'UgBBYS2sOqTuMpoF3BR0'; + const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`; + + console.log('[voice] synthesizing speech, text length:', text.length, 'voiceId:', voiceId); + + const response = await fetch(url, { + method: 'POST', + headers: { + 'xi-api-key': config.elevenlabs.apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + text, + model_id: 'eleven_multilingual_v2', + voice_settings: { + stability: 0.5, + similarity_boost: 0.75, + }, + }), + }); + + if (!response.ok) { + const errText = await response.text().catch(() => 'Unknown error'); + console.error('[voice] ElevenLabs API error:', response.status, errText); + throw new Error(`ElevenLabs API error ${response.status}: ${errText}`); + } + + const arrayBuffer = await response.arrayBuffer(); + const audioBase64 = Buffer.from(arrayBuffer).toString('base64'); + console.log('[voice] synthesized audio, base64 length:', audioBase64.length); + return { audioBase64, mimeType: 'audio/mpeg' }; +} diff --git a/apps/x/packages/shared/src/ipc.ts b/apps/x/packages/shared/src/ipc.ts index 2dc268f7..97f753e1 100644 --- a/apps/x/packages/shared/src/ipc.ts +++ b/apps/x/packages/shared/src/ipc.ts @@ -130,6 +130,8 @@ const ipcSchemas = { req: z.object({ runId: z.string(), message: UserMessageContent, + voiceInput: z.boolean().optional(), + voiceOutput: z.enum(['summary', 'full']).optional(), }), res: z.object({ messageId: z.string(), @@ -460,6 +462,23 @@ const ipcSchemas = { })), }), }, + // Voice mode channels + 'voice:getConfig': { + req: z.null(), + res: z.object({ + deepgram: z.object({ apiKey: z.string() }).nullable(), + elevenlabs: z.object({ apiKey: z.string(), voiceId: z.string().optional() }).nullable(), + }), + }, + 'voice:synthesize': { + req: z.object({ + text: z.string(), + }), + res: z.object({ + audioBase64: z.string(), + mimeType: z.string(), + }), + 
}, // Inline task schedule classification 'inline-task:classifySchedule': { req: z.object({