voice mode with TTS input/output

2026-05-19 18:35:18 +02:00 · 2026-03-13 10:26:08 +05:30 · 2026-03-13 10:26:08 +05:30 · 47d5118448
commit 47d5118448
parent d150294af1
17 changed files with 937 additions and 15 deletions
--- a/apps/x/apps/main/src/ipc.ts
+++ b/apps/x/apps/main/src/ipc.ts
@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
 import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
 import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
 import { search } from '@x/core/dist/search/search.js';
-import { versionHistory } from '@x/core';
+import { versionHistory, voice } from '@x/core';
 import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';

 type InvokeChannels = ipc.InvokeChannels;
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
      return runsCore.createRun(args);
    },
    'runs:createMessage': async (_event, args) => {
-      return { messageId: await runsCore.createMessage(args.runId, args.message) };
+      return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
    },
    'runs:authorizePermission': async (_event, args) => {
      await runsCore.authorizePermission(args.runId, args.authorization);
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
      const schedule = await classifySchedule(args.instruction);
      return { schedule };
    },
+    'voice:getConfig': async () => {
+      return voice.getVoiceConfig();
+    },
+    'voice:synthesize': async (_event, args) => {
+      return voice.synthesizeSpeech(args.text);
+    },
  });
 }
--- a/apps/x/apps/main/src/main.ts
+++ b/apps/x/apps/main/src/main.ts
@ -1,4 +1,4 @@
-import { app, BrowserWindow, protocol, net, shell } from "electron";
+import { app, BrowserWindow, protocol, net, shell, session } from "electron";
 import path from "node:path";
 import {
  setupIpcHandlers,
@ -92,6 +92,15 @@ function createWindow() {
    },
  });

+  // Permission policy for the default session: approve 'media' requests
+  // (voice mode needs microphone capture) and deny every other permission.
+  session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
+    callback(permission === 'media');
+  });
+
  // Show window when content is ready to prevent blank screen
  win.once("ready-to-show", () => {
    win.show();
--- a/apps/x/apps/renderer/src/App.tsx
+++ b/apps/x/apps/renderer/src/App.tsx
@ -76,6 +76,8 @@ import {
 import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
 import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
 import { toast } from "sonner"
+import { useVoiceMode } from '@/hooks/useVoiceMode'
+import { useVoiceTTS } from '@/hooks/useVoiceTTS'

 type DirEntry = z.infer<typeof workspace.DirEntry>
 type RunEventType = z.infer<typeof RunEvent>
@ -546,6 +548,87 @@ function App() {
  const [agentId] = useState<string>('copilot')
  const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)

+  // Voice mode state
+  const [voiceAvailable, setVoiceAvailable] = useState(false)
+  const [ttsAvailable, setTtsAvailable] = useState(false)
+  const [ttsEnabled, setTtsEnabled] = useState(false)
+  const ttsEnabledRef = useRef(false)
+  const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
+  const ttsModeRef = useRef<'summary' | 'full'>('summary')
+  const [isRecording, setIsRecording] = useState(false)
+  const voiceTextBufferRef = useRef('')
+  const spokenIndexRef = useRef(0)
+  const isRecordingRef = useRef(false)
+
+  const tts = useVoiceTTS()
+  const ttsRef = useRef(tts)
+  ttsRef.current = tts
+
+  const voice = useVoiceMode()
+  const voiceRef = useRef(voice)
+  voiceRef.current = voice
+
+  // Check if voice is available on mount
+  useEffect(() => {
+    window.ipc.invoke('voice:getConfig', null).then(config => {
+      setVoiceAvailable(!!config.deepgram)
+      setTtsAvailable(!!config.elevenlabs)
+    }).catch(() => {
+      setVoiceAvailable(false)
+      setTtsAvailable(false)
+    })
+  }, [])
+
+  const handleStartRecording = useCallback(() => {
+    setIsRecording(true)
+    isRecordingRef.current = true
+    voice.start()
+  }, [voice])
+
+  const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
+  const pendingVoiceInputRef = useRef(false)
+
+  const handleSubmitRecording = useCallback(() => {
+    const text = voice.submit()
+    setIsRecording(false)
+    isRecordingRef.current = false
+    if (text) {
+      pendingVoiceInputRef.current = true
+      handlePromptSubmitRef.current?.({ text })
+    }
+  }, [voice])
+
+  const handleToggleTts = useCallback(() => {
+    setTtsEnabled(prev => {
+      const next = !prev
+      ttsEnabledRef.current = next
+      if (!next) {
+        ttsRef.current.cancel()
+      }
+      return next
+    })
+  }, [])
+
+  const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
+    setTtsMode(mode)
+    ttsModeRef.current = mode
+  }, [])
+
+  const handleCancelRecording = useCallback(() => {
+    voice.cancel()
+    setIsRecording(false)
+    isRecordingRef.current = false
+  }, [voice])
+
+  // Helper to cancel recording from any navigation handler
+  const cancelRecordingIfActive = useCallback(() => {
+    if (isRecordingRef.current) {
+      voiceRef.current.cancel()
+      setIsRecording(false)
+      isRecordingRef.current = false
+    }
+  }, [])
+
  // Runs history state
  type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
  const [runs, setRuns] = useState<RunListItem[]>([])
@ -1496,6 +1579,9 @@ function App() {
        if (!isActiveRun) return
        setIsProcessing(true)
        setModelUsage(null)
+        // Reset voice buffer for new response
+        voiceTextBufferRef.current = ''
+        spokenIndexRef.current = 0
        break

      case 'run-processing-end':
@ -1545,6 +1631,20 @@ function App() {
          if (llmEvent.type === 'text-delta' && llmEvent.delta) {
            appendStreamingBuffer(event.runId, llmEvent.delta)
            setCurrentAssistantMessage(prev => prev + llmEvent.delta)
+
+            // Extract completed <voice>…</voice> segments from the streamed text and
+            // hand them to TTS when voice output is enabled. Only the text after
+            // spokenIndexRef is scanned, so each segment is spoken at most once.
+            voiceTextBufferRef.current += llmEvent.delta
+            const scanStart = spokenIndexRef.current
+            const remaining = voiceTextBufferRef.current.substring(scanStart)
+            const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
+            let voiceMatch: RegExpExecArray | null
+            while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
+              const voiceContent = voiceMatch[1].trim()
+              console.log('[voice] extracted voice tag:', voiceContent)
+              if (voiceContent && ttsEnabledRef.current) {
+                ttsRef.current.speak(voiceContent)
+              }
+              // voiceMatch.index is relative to `remaining` (sliced once at scanStart),
+              // so set the absolute offset instead of accumulating; accumulating
+              // overshoots as soon as a single pass contains two or more segments.
+              spokenIndexRef.current = scanStart + voiceMatch.index + voiceMatch[0].length
+            }
          } else if (llmEvent.type === 'tool-call') {
            setConversation(prev => [...prev, {
              id: llmEvent.toolCallId || `tool-${Date.now()}`,
@ -1584,6 +1684,7 @@ function App() {
          if (msg.role === 'assistant') {
            setCurrentAssistantMessage(currentMsg => {
              if (currentMsg) {
+                const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
                setConversation(prev => {
                  const exists = prev.some(m =>
                    m.id === event.messageId && 'role' in m && m.role === 'assistant'
@ -1592,7 +1693,7 @@ function App() {
                  return [...prev, {
                    id: event.messageId,
                    role: 'assistant',
-                    content: currentMsg,
+                    content: cleanedContent,
                    timestamp: Date.now(),
                  }]
                })
@ -1887,6 +1988,8 @@ function App() {
        await window.ipc.invoke('runs:createMessage', {
          runId: currentRunId,
          message: attachmentPayload,
+          voiceInput: pendingVoiceInputRef.current || undefined,
+          voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
        })
      } else {
        // Legacy path: plain string with optional XML-formatted @mentions.
@ -1915,11 +2018,15 @@ function App() {
        await window.ipc.invoke('runs:createMessage', {
          runId: currentRunId,
          message: formattedMessage,
+          voiceInput: pendingVoiceInputRef.current || undefined,
+          voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
        })

        titleSource = formattedMessage
      }

+      pendingVoiceInputRef.current = false
+
      if (isNewRun) {
        const inferredTitle = inferRunTitleFromMessage(titleSource)
        setRuns((prev) => {
@ -1936,6 +2043,7 @@ function App() {
      console.error('Failed to send message:', error)
    }
  }
+  handlePromptSubmitRef.current = handlePromptSubmit

  const handleStop = useCallback(async () => {
    if (!runId) return
@ -2065,6 +2173,7 @@ function App() {
  }, [])

  const openChatInNewTab = useCallback((targetRunId: string) => {
+    cancelRecordingIfActive()
    const existingTab = chatTabs.find(t => t.runId === targetRunId)
    if (existingTab) {
      // Cancel stale in-flight loads from previously focused tabs.
@ -2080,12 +2189,18 @@ function App() {
    setChatTabs(prev => [...prev, { id, runId: targetRunId }])
    setActiveChatTabId(id)
    loadRun(targetRunId)
-  }, [chatTabs, loadRun, restoreChatTabState])
+  }, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])

  const switchChatTab = useCallback((tabId: string) => {
    const tab = chatTabs.find(t => t.id === tabId)
    if (!tab) return
    if (tabId === activeChatTabId) return
+    // Cancel any active recording when switching tabs
+    if (isRecordingRef.current) {
+      voiceRef.current.cancel()
+      setIsRecording(false)
+      isRecordingRef.current = false
+    }
    saveChatScrollForTab(activeChatTabId)
    // Cancel stale in-flight loads from previously focused tabs.
    loadRunRequestIdRef.current += 1
@ -2471,13 +2586,14 @@ function App() {
    const current = currentViewState
    if (viewStatesEqual(current, nextView)) return

+    cancelRecordingIfActive()
    const nextHistory = {
      back: appendUnique(historyRef.current.back, current),
      forward: [] as ViewState[],
    }
    setHistory(nextHistory)
    await applyViewState(nextView)
-  }, [appendUnique, applyViewState, currentViewState, setHistory])
+  }, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])

  const navigateBack = useCallback(async () => {
    const { back, forward } = historyRef.current
@ -3412,6 +3528,7 @@ function App() {
              tasksActions={{
                onNewChat: handleNewChatTab,
                onSelectRun: (runIdToLoad) => {
+                  cancelRecordingIfActive()
                  if (selectedPath || isGraphOpen) {
                    setIsChatSidebarOpen(true)
                  }
@ -3814,7 +3931,7 @@ function App() {
                                {tabState.currentAssistantMessage && (
                                  <Message from="assistant">
                                    <MessageContent>
-                                      <MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
+                                      <MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
                                    </MessageContent>
                                  </Message>
                                )}
@ -3865,6 +3982,18 @@ function App() {
                            runId={tabState.runId}
                            initialDraft={chatDraftsRef.current.get(tab.id)}
                            onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
+                            isRecording={isActive && isRecording}
+                            recordingText={isActive ? voice.interimText : undefined}
+                            recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
+                            onStartRecording={isActive ? handleStartRecording : undefined}
+                            onSubmitRecording={isActive ? handleSubmitRecording : undefined}
+                            onCancelRecording={isActive ? handleCancelRecording : undefined}
+                            voiceAvailable={isActive && voiceAvailable}
+                            ttsAvailable={isActive && ttsAvailable}
+                            ttsEnabled={ttsEnabled}
+                            ttsMode={ttsMode}
+                            onToggleTts={isActive ? handleToggleTts : undefined}
+                            onTtsModeChange={isActive ? handleTtsModeChange : undefined}
                          />
                        </div>
                      )
@ -3914,6 +4043,18 @@ function App() {
                onToolOpenChangeForTab={setToolOpenForTab}
                onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
                onActivate={() => setActiveShortcutPane('right')}
+                isRecording={isRecording}
+                recordingText={voice.interimText}
+                recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
+                onStartRecording={handleStartRecording}
+                onSubmitRecording={handleSubmitRecording}
+                onCancelRecording={handleCancelRecording}
+                voiceAvailable={voiceAvailable}
+                ttsAvailable={ttsAvailable}
+                ttsEnabled={ttsEnabled}
+                ttsMode={ttsMode}
+                onToggleTts={handleToggleTts}
+                onTtsModeChange={handleTtsModeChange}
              />
            )}
            {/* Rendered last so its no-drag region paints over the sidebar drag region */}
--- a/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
+++ b/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
@ -1,4 +1,5 @@
 import { useCallback, useEffect, useRef, useState } from 'react'
+import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
 import {
  ArrowUp,
  AudioLines,
@ -9,7 +10,9 @@ import {
  FileSpreadsheet,
  FileText,
  FileVideo,
+  Headphones,
  LoaderIcon,
+  Mic,
  Plus,
  Square,
  X,
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
  runId?: string | null
  initialDraft?: string
  onDraftChange?: (text: string) => void
+  isRecording?: boolean
+  recordingText?: string
+  recordingState?: 'connecting' | 'listening'
+  onStartRecording?: () => void
+  onSubmitRecording?: () => void
+  onCancelRecording?: () => void
+  voiceAvailable?: boolean
+  ttsAvailable?: boolean
+  ttsEnabled?: boolean
+  ttsMode?: 'summary' | 'full'
+  onToggleTts?: () => void
+  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }

 function ChatInputInner({
@ -115,6 +130,18 @@ function ChatInputInner({
  runId,
  initialDraft,
  onDraftChange,
+  isRecording,
+  recordingText,
+  recordingState,
+  onStartRecording,
+  onSubmitRecording,
+  onCancelRecording,
+  voiceAvailable,
+  ttsAvailable,
+  ttsEnabled,
+  ttsMode,
+  onToggleTts,
+  onTtsModeChange,
 }: ChatInputInnerProps) {
  const controller = usePromptInputController()
  const message = controller.textInput.value
@ -367,6 +394,40 @@ function ChatInputInner({
          e.target.value = ''
        }}
      />
+      {isRecording ? (
+        /* ── Recording bar ── */
+        <div className="flex items-center gap-3 px-4 py-3">
+          <button
+            type="button"
+            onClick={onCancelRecording}
+            className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
+            aria-label="Cancel recording"
+          >
+            <X className="h-4 w-4" />
+          </button>
+          <div className="flex flex-1 items-center gap-2 overflow-hidden">
+            <VoiceWaveform />
+            <span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
+              {recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
+            </span>
+          </div>
+          <Button
+            size="icon"
+            onClick={onSubmitRecording}
+            disabled={!recordingText?.trim()}
+            className={cn(
+              'h-7 w-7 shrink-0 rounded-full transition-all',
+              recordingText?.trim()
+                ? 'bg-primary text-primary-foreground hover:bg-primary/90'
+                : 'bg-muted text-muted-foreground'
+            )}
+          >
+            <ArrowUp className="h-4 w-4" />
+          </Button>
+        </div>
+      ) : (
+        /* ── Normal input ── */
+        <>
      <div className="px-4 pt-4 pb-2">
        <PromptInputTextarea
          placeholder="Type your message..."
@ -414,6 +475,63 @@ function ChatInputInner({
            </DropdownMenuContent>
          </DropdownMenu>
        )}
+        {onToggleTts && ttsAvailable && (
+          <div className="flex shrink-0 items-center">
+            <Tooltip>
+              <TooltipTrigger asChild>
+                <button
+                  type="button"
+                  onClick={onToggleTts}
+                  className={cn(
+                    'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
+                    ttsEnabled
+                      ? 'text-foreground hover:bg-muted'
+                      : 'text-muted-foreground hover:bg-muted hover:text-foreground'
+                  )}
+                  aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
+                >
+                  <Headphones className="h-4 w-4" />
+                  {!ttsEnabled && (
+                    <span className="absolute inset-0 flex items-center justify-center pointer-events-none">
+                      <span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
+                    </span>
+                  )}
+                </button>
+              </TooltipTrigger>
+              <TooltipContent side="top">
+                {ttsEnabled ? 'Voice output on' : 'Voice output off'}
+              </TooltipContent>
+            </Tooltip>
+            {ttsEnabled && onTtsModeChange && (
+              <DropdownMenu>
+                <DropdownMenuTrigger asChild>
+                  <button
+                    type="button"
+                    className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
+                  >
+                    <ChevronDown className="h-3 w-3" />
+                  </button>
+                </DropdownMenuTrigger>
+                <DropdownMenuContent align="end">
+                  <DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
+                    <DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
+                    <DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
+                  </DropdownMenuRadioGroup>
+                </DropdownMenuContent>
+              </DropdownMenu>
+            )}
+          </div>
+        )}
+        {voiceAvailable && onStartRecording && (
+          <button
+            type="button"
+            onClick={onStartRecording}
+            className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
+            aria-label="Voice input"
+          >
+            <Mic className="h-4 w-4" />
+          </button>
+        )}
        {isProcessing ? (
          <Button
            size="icon"
@ -448,6 +566,31 @@ function ChatInputInner({
          </Button>
        )}
      </div>
+        </>
+      )}
+    </div>
+  )
+}
+
+/**
+ * Recording indicator: five thin bars that all run the `voice-wave` keyframes
+ * (height pulsing between 4px and 16px), each bar delayed 0.15s after the previous one.
+ */
+function VoiceWaveform() {
+  const bars = Array.from({ length: 5 }, (_, bar) => {
+    const delay = bar * 0.15
+    return (
+      <span
+        key={bar}
+        className="w-[3px] rounded-full bg-primary"
+        style={{ animation: `voice-wave 1.2s ease-in-out ${delay}s infinite` }}
+      />
+    )
+  })
+
+  return (
+    <div className="flex items-center gap-[3px] h-5">
+      {bars}
+      <style>{`
+        @keyframes voice-wave {
+          0%, 100% { height: 4px; }
+          50% { height: 16px; }
+        }
+      `}</style>
     </div>
   )
 }
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
  runId?: string | null
  initialDraft?: string
  onDraftChange?: (text: string) => void
+  isRecording?: boolean
+  recordingText?: string
+  recordingState?: 'connecting' | 'listening'
+  onStartRecording?: () => void
+  onSubmitRecording?: () => void
+  onCancelRecording?: () => void
+  voiceAvailable?: boolean
+  ttsAvailable?: boolean
+  ttsEnabled?: boolean
+  ttsMode?: 'summary' | 'full'
+  onToggleTts?: () => void
+  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }

 export function ChatInputWithMentions({
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
  runId,
  initialDraft,
  onDraftChange,
+  isRecording,
+  recordingText,
+  recordingState,
+  onStartRecording,
+  onSubmitRecording,
+  onCancelRecording,
+  voiceAvailable,
+  ttsAvailable,
+  ttsEnabled,
+  ttsMode,
+  onToggleTts,
+  onTtsModeChange,
 }: ChatInputWithMentionsProps) {
  return (
    <PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
        runId={runId}
        initialDraft={initialDraft}
        onDraftChange={onDraftChange}
+        isRecording={isRecording}
+        recordingText={recordingText}
+        recordingState={recordingState}
+        onStartRecording={onStartRecording}
+        onSubmitRecording={onSubmitRecording}
+        onCancelRecording={onCancelRecording}
+        voiceAvailable={voiceAvailable}
+        ttsAvailable={ttsAvailable}
+        ttsEnabled={ttsEnabled}
+        ttsMode={ttsMode}
+        onToggleTts={onToggleTts}
+        onTtsModeChange={onTtsModeChange}
      />
    </PromptInputProvider>
  )
--- a/apps/x/apps/renderer/src/components/chat-sidebar.tsx
+++ b/apps/x/apps/renderer/src/components/chat-sidebar.tsx
@ -108,6 +108,19 @@ interface ChatSidebarProps {
  onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
  onOpenKnowledgeFile?: (path: string) => void
  onActivate?: () => void
+  // Voice / TTS props
+  isRecording?: boolean
+  recordingText?: string
+  recordingState?: 'connecting' | 'listening'
+  onStartRecording?: () => void
+  onSubmitRecording?: () => void
+  onCancelRecording?: () => void
+  voiceAvailable?: boolean
+  ttsAvailable?: boolean
+  ttsEnabled?: boolean
+  ttsMode?: 'summary' | 'full'
+  onToggleTts?: () => void
+  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }

 export function ChatSidebar({
@ -146,6 +159,18 @@ export function ChatSidebar({
  onToolOpenChangeForTab,
  onOpenKnowledgeFile,
  onActivate,
+  isRecording,
+  recordingText,
+  recordingState,
+  onStartRecording,
+  onSubmitRecording,
+  onCancelRecording,
+  voiceAvailable,
+  ttsAvailable,
+  ttsEnabled,
+  ttsMode,
+  onToggleTts,
+  onTtsModeChange,
 }: ChatSidebarProps) {
  const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
  const [isResizing, setIsResizing] = useState(false)
@ -542,6 +567,18 @@ export function ChatSidebar({
                          runId={tabState.runId}
                          initialDraft={getInitialDraft?.(tab.id)}
                          onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
+                          isRecording={isActive && isRecording}
+                          recordingText={isActive ? recordingText : undefined}
+                          recordingState={isActive ? recordingState : undefined}
+                          onStartRecording={isActive ? onStartRecording : undefined}
+                          onSubmitRecording={isActive ? onSubmitRecording : undefined}
+                          onCancelRecording={isActive ? onCancelRecording : undefined}
+                          voiceAvailable={isActive && voiceAvailable}
+                          ttsAvailable={isActive && ttsAvailable}
+                          ttsEnabled={ttsEnabled}
+                          ttsMode={ttsMode}
+                          onToggleTts={isActive ? onToggleTts : undefined}
+                          onTtsModeChange={isActive ? onTtsModeChange : undefined}
                        />
                      </div>
                    )
--- a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
@ -0,0 +1,218 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+
+export type VoiceState = 'idle' | 'connecting' | 'listening';
+
+// Cache the API key so we skip the IPC call after first use
+let cachedApiKey: string | null = null;
+let apiKeyFetched = false;
+
+/**
+ * React hook for voice input: streams microphone audio to a Deepgram WebSocket
+ * and exposes the running transcript.
+ *
+ * The WebSocket is opened on mount (once an API key is available) and re-opened
+ * 3 seconds after it closes; microphone capture only runs between `start()` and
+ * `submit()` / `cancel()`.
+ *
+ * @returns `state` ('idle' | 'connecting' | 'listening'), `interimText` (finalized
+ *   transcript plus the latest interim result), `start()` to begin recording,
+ *   `submit()` to stop and return the transcript, and `cancel()` to stop and discard it.
+ */
+export function useVoiceMode() {
+    const [state, setState] = useState<VoiceState>('idle');
+    const [interimText, setInterimText] = useState('');
+    const wsRef = useRef<WebSocket | null>(null);
+    const mediaStreamRef = useRef<MediaStream | null>(null);
+    const processorRef = useRef<ScriptProcessorNode | null>(null);
+    const audioCtxRef = useRef<AudioContext | null>(null);
+    const transcriptBufferRef = useRef('');
+    const interimRef = useRef('');
+    const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+    const mountedRef = useRef(true);
+    // True from the moment start() is accepted until the recording stops or fails.
+    // Read synchronously, so it is reliable where the `state` closure may be stale.
+    const activeRef = useRef(false);
+    // Bumped on every start/stop. An in-flight start() compares it with its own
+    // value after each await to detect that it was cancelled in the meantime.
+    const sessionRef = useRef(0);
+
+    // Connect (or reconnect) the Deepgram WebSocket.
+    // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
+    const connectWs = useCallback(() => {
+        if (!cachedApiKey) return;
+        const existing = wsRef.current;
+        if (existing && (existing.readyState === WebSocket.OPEN || existing.readyState === WebSocket.CONNECTING)) return;
+
+        const ws = new WebSocket(
+            `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`,
+            ['token', cachedApiKey]
+        );
+        wsRef.current = ws;
+
+        ws.onopen = () => {
+            console.log('[voice] WebSocket connected');
+        };
+
+        ws.onmessage = (event) => {
+            // Results that arrive after the recording was stopped belong to a finished session.
+            if (!activeRef.current) return;
+            if (typeof event.data !== 'string') return;
+
+            let transcript: unknown;
+            let isFinal = false;
+            try {
+                const data = JSON.parse(event.data);
+                transcript = data?.channel?.alternatives?.[0]?.transcript;
+                isFinal = Boolean(data?.is_final);
+            } catch (err) {
+                // A malformed frame must not throw out of the socket handler.
+                console.error('[voice] could not parse WebSocket message:', err);
+                return;
+            }
+            if (typeof transcript !== 'string' || !transcript) return;
+
+            const separator = transcriptBufferRef.current ? ' ' : '';
+            if (isFinal) {
+                transcriptBufferRef.current += separator + transcript;
+                interimRef.current = '';
+                setInterimText(transcriptBufferRef.current);
+            } else {
+                interimRef.current = transcript;
+                setInterimText(transcriptBufferRef.current + separator + transcript);
+            }
+        };
+
+        ws.onerror = () => {
+            console.error('[voice] WebSocket error');
+        };
+
+        ws.onclose = () => {
+            console.log('[voice] WebSocket closed');
+            // Only clear the ref if it still points at this socket; a newer
+            // connection may already have replaced it.
+            if (wsRef.current === ws) {
+                wsRef.current = null;
+            }
+            // Auto-reconnect after 3 seconds if still mounted
+            if (mountedRef.current && cachedApiKey) {
+                if (reconnectTimerRef.current) {
+                    clearTimeout(reconnectTimerRef.current);
+                }
+                reconnectTimerRef.current = setTimeout(() => {
+                    reconnectTimerRef.current = null;
+                    if (mountedRef.current) connectWs();
+                }, 3000);
+            }
+        };
+    }, []);
+
+    // Tear down the processor node, audio context and microphone tracks.
+    // Touches refs only (no React state), so it is also safe to call during unmount.
+    const releaseAudio = useCallback(() => {
+        if (processorRef.current) {
+            processorRef.current.onaudioprocess = null;
+            processorRef.current.disconnect();
+            processorRef.current = null;
+        }
+        if (audioCtxRef.current) {
+            // Best-effort: close() rejects if the context is already closed.
+            void audioCtxRef.current.close().catch(() => undefined);
+            audioCtxRef.current = null;
+        }
+        if (mediaStreamRef.current) {
+            mediaStreamRef.current.getTracks().forEach(t => t.stop());
+            mediaStreamRef.current = null;
+        }
+    }, []);
+
+    // Fetch API key on mount and establish persistent WebSocket
+    useEffect(() => {
+        mountedRef.current = true;
+
+        const init = async () => {
+            if (!apiKeyFetched) {
+                apiKeyFetched = true;
+                try {
+                    const config = await window.ipc.invoke('voice:getConfig', null);
+                    cachedApiKey = config.deepgram?.apiKey ?? null;
+                } catch { /* ignore: start() retries the lookup */ }
+            }
+            if (cachedApiKey && mountedRef.current) {
+                connectWs();
+            }
+        };
+        void init();
+
+        return () => {
+            mountedRef.current = false;
+            // Invalidate any start() still awaiting and release the microphone
+            // if the component unmounts in the middle of a recording.
+            sessionRef.current += 1;
+            activeRef.current = false;
+            releaseAudio();
+            if (reconnectTimerRef.current) {
+                clearTimeout(reconnectTimerRef.current);
+                reconnectTimerRef.current = null;
+            }
+            // Close WS on unmount, suppress reconnect by nulling onclose
+            if (wsRef.current) {
+                wsRef.current.onclose = null;
+                wsRef.current.close();
+                wsRef.current = null;
+            }
+        };
+    }, [connectWs, releaseAudio]);
+
+    // Stop only audio capture (mic + processor), leaving WS open
+    const stopAudioCapture = useCallback(() => {
+        sessionRef.current += 1; // cancels a start() that is still awaiting
+        activeRef.current = false;
+        releaseAudio();
+        setInterimText('');
+        transcriptBufferRef.current = '';
+        interimRef.current = '';
+        setState('idle');
+    }, [releaseAudio]);
+
+    /**
+     * Begin a recording: make sure the API key and an open WebSocket are available,
+     * acquire the microphone, and stream 16 kHz mono 16-bit PCM to the socket.
+     * Resolves without recording (state returns to 'idle') if Deepgram is not
+     * configured, the socket does not open within 5 seconds, the microphone is
+     * unavailable, or the recording is cancelled while this call is still awaiting.
+     */
+    const start = useCallback(async () => {
+        // Checked via ref so two calls issued before a re-render cannot both pass.
+        if (activeRef.current) return;
+        activeRef.current = true;
+        sessionRef.current += 1;
+        const session = sessionRef.current;
+        const superseded = () => sessionRef.current !== session;
+        const fail = () => {
+            activeRef.current = false;
+            setState('idle');
+        };
+
+        // Ensure we have an API key
+        if (!cachedApiKey) {
+            try {
+                const config = await window.ipc.invoke('voice:getConfig', null);
+                cachedApiKey = config.deepgram?.apiKey ?? null;
+            } catch (err) {
+                console.error('[voice] failed to load voice config:', err);
+            }
+            if (superseded()) return;
+        }
+        if (!cachedApiKey) {
+            console.error('Deepgram not configured');
+            fail();
+            return;
+        }
+
+        transcriptBufferRef.current = '';
+        interimRef.current = '';
+        setInterimText('');
+
+        // If WS isn't connected, connect and wait for it
+        if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
+            setState('connecting');
+            connectWs();
+            // Wait for WS to be ready (up to 5 seconds); stop polling early on cancel
+            const deadline = Date.now() + 5000;
+            const wsOk = await new Promise<boolean>((resolve) => {
+                const poll = setInterval(() => {
+                    const open = wsRef.current?.readyState === WebSocket.OPEN;
+                    if (open || superseded() || Date.now() >= deadline) {
+                        clearInterval(poll);
+                        resolve(open);
+                    }
+                }, 50);
+            });
+            if (superseded()) return;
+            if (!wsOk) {
+                fail();
+                return;
+            }
+        }
+
+        setState('listening');
+
+        // Start mic
+        let stream: MediaStream;
+        try {
+            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        } catch (err) {
+            console.error('Microphone access denied:', err);
+            if (!superseded()) fail();
+            return;
+        }
+        if (superseded()) {
+            // Cancelled while the microphone was being opened: release it immediately.
+            stream.getTracks().forEach(t => t.stop());
+            return;
+        }
+
+        mediaStreamRef.current = stream;
+
+        // Start audio capture
+        try {
+            const audioCtx = new AudioContext({ sampleRate: 16000 });
+            audioCtxRef.current = audioCtx;
+            const source = audioCtx.createMediaStreamSource(stream);
+            const processor = audioCtx.createScriptProcessor(4096, 1, 1);
+            processorRef.current = processor;
+
+            processor.onaudioprocess = (e) => {
+                const ws = wsRef.current;
+                if (!ws || ws.readyState !== WebSocket.OPEN) return;
+                // Convert float samples in [-1, 1] to signed 16-bit PCM.
+                const float32 = e.inputBuffer.getChannelData(0);
+                const int16 = new Int16Array(float32.length);
+                for (let i = 0; i < float32.length; i++) {
+                    const s = Math.max(-1, Math.min(1, float32[i]));
+                    int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
+                }
+                ws.send(int16.buffer);
+            };
+
+            source.connect(processor);
+            processor.connect(audioCtx.destination);
+        } catch (err) {
+            // e.g. the AudioContext could not be created; release the mic and reset.
+            console.error('[voice] failed to start audio capture:', err);
+            stopAudioCapture();
+        }
+    }, [connectWs, stopAudioCapture]);
+
+    /** Stop recording and return the full transcript (finalized + any current interim) */
+    const submit = useCallback((): string => {
+        let text = transcriptBufferRef.current;
+        if (interimRef.current) {
+            text += (text ? ' ' : '') + interimRef.current;
+        }
+        text = text.trim();
+        stopAudioCapture();
+        return text;
+    }, [stopAudioCapture]);
+
+    /** Cancel recording without returning transcript */
+    const cancel = useCallback(() => {
+        stopAudioCapture();
+    }, [stopAudioCapture]);
+
+    return { state, interimText, start, submit, cancel };
+}
--- a/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
@ -0,0 +1,72 @@
+import { useCallback, useRef, useState } from 'react';
+
+/**
+ * Playback lifecycle reported by `useVoiceTTS`:
+ * - `idle`: the queue has been drained or playback was cancelled
+ * - `synthesizing`: waiting on the `voice:synthesize` IPC call for the next queued text
+ * - `speaking`: synthesized audio has been received and is being played
+ */
+export type TTSState = 'idle' | 'synthesizing' | 'speaking';
+
+/**
+ * React hook that speaks text through the `voice:synthesize` IPC channel.
+ *
+ * Texts passed to `speak` are queued and played strictly one after another.
+ * `cancel` drops everything that is queued, stops the audio that is currently
+ * playing and invalidates any synthesis request that is still in flight, so
+ * nothing from before the cancel can start playing afterwards.
+ *
+ * @returns `state` (current {@link TTSState}), `speak(text)` to enqueue text,
+ *          and `cancel()` to stop and clear everything.
+ */
+export function useVoiceTTS() {
+    const [state, setState] = useState<TTSState>('idle');
+    const audioRef = useRef<HTMLAudioElement | null>(null);
+    const queueRef = useRef<string[]>([]);
+    const processingRef = useRef(false);
+    // Bumped by cancel(). A processQueue run captures the value it started with
+    // and treats any later mismatch as "I have been cancelled".
+    const generationRef = useRef(0);
+    // Resolver of the playback promise currently being awaited, so cancel() can
+    // release it: pausing an <audio> element fires neither `ended` nor `error`.
+    const stopPlaybackRef = useRef<(() => void) | null>(null);
+
+    const processQueue = useCallback(async () => {
+        if (processingRef.current) return;
+        processingRef.current = true;
+
+        const generation = generationRef.current;
+        const isCancelled = () => generationRef.current !== generation;
+
+        while (!isCancelled() && queueRef.current.length > 0) {
+            const text = queueRef.current.shift()!;
+            if (!text.trim()) continue;
+
+            setState('synthesizing');
+            console.log('[tts] synthesizing:', text.substring(0, 80));
+            try {
+                const result = await window.ipc.invoke('voice:synthesize', { text });
+                // cancel() may have run while synthesis was pending; do not
+                // play stale audio or overwrite the state it reset.
+                if (isCancelled()) break;
+                console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
+                setState('speaking');
+
+                await new Promise<void>((resolve, reject) => {
+                    const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
+                    const audio = new Audio(dataUrl);
+                    audioRef.current = audio;
+                    stopPlaybackRef.current = resolve;
+                    audio.onended = () => {
+                        console.log('[tts] audio ended');
+                        resolve();
+                    };
+                    audio.onerror = (e) => {
+                        console.error('[tts] audio error:', e);
+                        reject(new Error('Audio playback failed'));
+                    };
+                    audio.play().then(() => {
+                        console.log('[tts] audio playing');
+                    }).catch((err) => {
+                        // pause() during a pending play() rejects it; that is
+                        // expected after cancel() and not worth reporting.
+                        if (isCancelled()) return;
+                        console.error('[tts] play() rejected:', err);
+                        reject(err);
+                    });
+                });
+            } catch (err) {
+                // Best effort: log and move on to the next queued text.
+                console.error('[tts] error:', err);
+            }
+        }
+
+        // A cancelled run must not touch shared state: cancel() already reset
+        // it, and a newer run may own the refs by now.
+        if (isCancelled()) return;
+
+        audioRef.current = null;
+        stopPlaybackRef.current = null;
+        processingRef.current = false;
+        setState('idle');
+    }, []);
+
+    const speak = useCallback((text: string) => {
+        console.log('[tts] speak() called:', text.substring(0, 80));
+        queueRef.current.push(text);
+        // processQueue handles its own errors; the promise is intentionally not awaited.
+        void processQueue();
+    }, [processQueue]);
+
+    const cancel = useCallback(() => {
+        // Invalidate the running processQueue loop (if any) before resetting state.
+        generationRef.current += 1;
+        queueRef.current = [];
+        if (audioRef.current) {
+            audioRef.current.pause();
+            audioRef.current = null;
+        }
+        // Release the loop if it is blocked waiting for playback to finish.
+        const stopPlayback = stopPlaybackRef.current;
+        stopPlaybackRef.current = null;
+        if (stopPlayback) stopPlayback();
+        processingRef.current = false;
+        setState('idle');
+    }, []);
+
+    return { state, speak, cancel };
+}