voice mode with speech-to-text input and TTS output

This commit is contained in:
Arjun 2026-03-13 10:26:08 +05:30
parent e730c118dc
commit 8db1a091f0
17 changed files with 937 additions and 15 deletions

View file

@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
import { search } from '@x/core/dist/search/search.js';
import { versionHistory } from '@x/core';
import { versionHistory, voice } from '@x/core';
import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';
type InvokeChannels = ipc.InvokeChannels;
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
return runsCore.createRun(args);
},
'runs:createMessage': async (_event, args) => {
return { messageId: await runsCore.createMessage(args.runId, args.message) };
return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
},
'runs:authorizePermission': async (_event, args) => {
await runsCore.authorizePermission(args.runId, args.authorization);
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
const schedule = await classifySchedule(args.instruction);
return { schedule };
},
'voice:getConfig': async () => {
return voice.getVoiceConfig();
},
'voice:synthesize': async (_event, args) => {
return voice.synthesizeSpeech(args.text);
},
});
}

View file

@ -1,4 +1,4 @@
import { app, BrowserWindow, protocol, net, shell } from "electron";
import { app, BrowserWindow, protocol, net, shell, session } from "electron";
import path from "node:path";
import {
setupIpcHandlers,
@ -92,6 +92,15 @@ function createWindow() {
},
});
// Grant microphone permission for voice mode
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
if (permission === 'media') {
callback(true);
} else {
callback(false);
}
});
// Show window when content is ready to prevent blank screen
win.once("ready-to-show", () => {
win.show();

View file

@ -76,6 +76,8 @@ import {
import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
import { toast } from "sonner"
import { useVoiceMode } from '@/hooks/useVoiceMode'
import { useVoiceTTS } from '@/hooks/useVoiceTTS'
type DirEntry = z.infer<typeof workspace.DirEntry>
type RunEventType = z.infer<typeof RunEvent>
@ -546,6 +548,87 @@ function App() {
const [agentId] = useState<string>('copilot')
const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)
// Voice mode state
const [voiceAvailable, setVoiceAvailable] = useState(false)
const [ttsAvailable, setTtsAvailable] = useState(false)
const [ttsEnabled, setTtsEnabled] = useState(false)
const ttsEnabledRef = useRef(false)
const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
const ttsModeRef = useRef<'summary' | 'full'>('summary')
const [isRecording, setIsRecording] = useState(false)
const voiceTextBufferRef = useRef('')
const spokenIndexRef = useRef(0)
const isRecordingRef = useRef(false)
const tts = useVoiceTTS()
const ttsRef = useRef(tts)
ttsRef.current = tts
const voice = useVoiceMode()
const voiceRef = useRef(voice)
voiceRef.current = voice
// Check if voice is available on mount
useEffect(() => {
window.ipc.invoke('voice:getConfig', null).then(config => {
setVoiceAvailable(!!config.deepgram)
setTtsAvailable(!!config.elevenlabs)
}).catch(() => {
setVoiceAvailable(false)
setTtsAvailable(false)
})
}, [])
const handleStartRecording = useCallback(() => {
setIsRecording(true)
isRecordingRef.current = true
voice.start()
}, [voice])
const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
const pendingVoiceInputRef = useRef(false)
const handleSubmitRecording = useCallback(() => {
const text = voice.submit()
setIsRecording(false)
isRecordingRef.current = false
if (text) {
pendingVoiceInputRef.current = true
handlePromptSubmitRef.current?.({ text })
}
}, [voice])
const handleToggleTts = useCallback(() => {
setTtsEnabled(prev => {
const next = !prev
ttsEnabledRef.current = next
if (!next) {
ttsRef.current.cancel()
}
return next
})
}, [])
const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
setTtsMode(mode)
ttsModeRef.current = mode
}, [])
const handleCancelRecording = useCallback(() => {
voice.cancel()
setIsRecording(false)
isRecordingRef.current = false
}, [voice])
// Helper to cancel recording from any navigation handler
const cancelRecordingIfActive = useCallback(() => {
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
}, [])
// Runs history state
type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
const [runs, setRuns] = useState<RunListItem[]>([])
@ -1496,6 +1579,9 @@ function App() {
if (!isActiveRun) return
setIsProcessing(true)
setModelUsage(null)
// Reset voice buffer for new response
voiceTextBufferRef.current = ''
spokenIndexRef.current = 0
break
case 'run-processing-end':
@ -1545,6 +1631,20 @@ function App() {
if (llmEvent.type === 'text-delta' && llmEvent.delta) {
appendStreamingBuffer(event.runId, llmEvent.delta)
setCurrentAssistantMessage(prev => prev + llmEvent.delta)
// Extract <voice> tags and send to TTS when enabled
voiceTextBufferRef.current += llmEvent.delta
const remaining = voiceTextBufferRef.current.substring(spokenIndexRef.current)
const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
let voiceMatch: RegExpExecArray | null
while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
const voiceContent = voiceMatch[1].trim()
console.log('[voice] extracted voice tag:', voiceContent)
if (voiceContent && ttsEnabledRef.current) {
ttsRef.current.speak(voiceContent)
}
spokenIndexRef.current += voiceMatch.index + voiceMatch[0].length
}
} else if (llmEvent.type === 'tool-call') {
setConversation(prev => [...prev, {
id: llmEvent.toolCallId || `tool-${Date.now()}`,
@ -1584,6 +1684,7 @@ function App() {
if (msg.role === 'assistant') {
setCurrentAssistantMessage(currentMsg => {
if (currentMsg) {
const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
setConversation(prev => {
const exists = prev.some(m =>
m.id === event.messageId && 'role' in m && m.role === 'assistant'
@ -1592,7 +1693,7 @@ function App() {
return [...prev, {
id: event.messageId,
role: 'assistant',
content: currentMsg,
content: cleanedContent,
timestamp: Date.now(),
}]
})
@ -1887,6 +1988,8 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: attachmentPayload,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
} else {
// Legacy path: plain string with optional XML-formatted @mentions.
@ -1915,11 +2018,15 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: formattedMessage,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
titleSource = formattedMessage
}
pendingVoiceInputRef.current = false
if (isNewRun) {
const inferredTitle = inferRunTitleFromMessage(titleSource)
setRuns((prev) => {
@ -1936,6 +2043,7 @@ function App() {
console.error('Failed to send message:', error)
}
}
handlePromptSubmitRef.current = handlePromptSubmit
const handleStop = useCallback(async () => {
if (!runId) return
@ -2065,6 +2173,7 @@ function App() {
}, [])
const openChatInNewTab = useCallback((targetRunId: string) => {
cancelRecordingIfActive()
const existingTab = chatTabs.find(t => t.runId === targetRunId)
if (existingTab) {
// Cancel stale in-flight loads from previously focused tabs.
@ -2080,12 +2189,18 @@ function App() {
setChatTabs(prev => [...prev, { id, runId: targetRunId }])
setActiveChatTabId(id)
loadRun(targetRunId)
}, [chatTabs, loadRun, restoreChatTabState])
}, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])
const switchChatTab = useCallback((tabId: string) => {
const tab = chatTabs.find(t => t.id === tabId)
if (!tab) return
if (tabId === activeChatTabId) return
// Cancel any active recording when switching tabs
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
saveChatScrollForTab(activeChatTabId)
// Cancel stale in-flight loads from previously focused tabs.
loadRunRequestIdRef.current += 1
@ -2471,13 +2586,14 @@ function App() {
const current = currentViewState
if (viewStatesEqual(current, nextView)) return
cancelRecordingIfActive()
const nextHistory = {
back: appendUnique(historyRef.current.back, current),
forward: [] as ViewState[],
}
setHistory(nextHistory)
await applyViewState(nextView)
}, [appendUnique, applyViewState, currentViewState, setHistory])
}, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])
const navigateBack = useCallback(async () => {
const { back, forward } = historyRef.current
@ -3412,6 +3528,7 @@ function App() {
tasksActions={{
onNewChat: handleNewChatTab,
onSelectRun: (runIdToLoad) => {
cancelRecordingIfActive()
if (selectedPath || isGraphOpen) {
setIsChatSidebarOpen(true)
}
@ -3814,7 +3931,7 @@ function App() {
{tabState.currentAssistantMessage && (
<Message from="assistant">
<MessageContent>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
</MessageContent>
</Message>
)}
@ -3865,6 +3982,18 @@ function App() {
runId={tabState.runId}
initialDraft={chatDraftsRef.current.get(tab.id)}
onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
isRecording={isActive && isRecording}
recordingText={isActive ? voice.interimText : undefined}
recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
onStartRecording={isActive ? handleStartRecording : undefined}
onSubmitRecording={isActive ? handleSubmitRecording : undefined}
onCancelRecording={isActive ? handleCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? handleToggleTts : undefined}
onTtsModeChange={isActive ? handleTtsModeChange : undefined}
/>
</div>
)
@ -3914,6 +4043,18 @@ function App() {
onToolOpenChangeForTab={setToolOpenForTab}
onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
onActivate={() => setActiveShortcutPane('right')}
isRecording={isRecording}
recordingText={voice.interimText}
recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
onStartRecording={handleStartRecording}
onSubmitRecording={handleSubmitRecording}
onCancelRecording={handleCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={handleToggleTts}
onTtsModeChange={handleTtsModeChange}
/>
)}
{/* Rendered last so its no-drag region paints over the sidebar drag region */}

View file

@ -1,4 +1,5 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
import {
ArrowUp,
AudioLines,
@ -9,7 +10,9 @@ import {
FileSpreadsheet,
FileText,
FileVideo,
Headphones,
LoaderIcon,
Mic,
Plus,
Square,
X,
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
function ChatInputInner({
@ -115,6 +130,18 @@ function ChatInputInner({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputInnerProps) {
const controller = usePromptInputController()
const message = controller.textInput.value
@ -367,6 +394,40 @@ function ChatInputInner({
e.target.value = ''
}}
/>
{isRecording ? (
/* ── Recording bar ── */
<div className="flex items-center gap-3 px-4 py-3">
<button
type="button"
onClick={onCancelRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Cancel recording"
>
<X className="h-4 w-4" />
</button>
<div className="flex flex-1 items-center gap-2 overflow-hidden">
<VoiceWaveform />
<span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
{recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
</span>
</div>
<Button
size="icon"
onClick={onSubmitRecording}
disabled={!recordingText?.trim()}
className={cn(
'h-7 w-7 shrink-0 rounded-full transition-all',
recordingText?.trim()
? 'bg-primary text-primary-foreground hover:bg-primary/90'
: 'bg-muted text-muted-foreground'
)}
>
<ArrowUp className="h-4 w-4" />
</Button>
</div>
) : (
/* ── Normal input ── */
<>
<div className="px-4 pt-4 pb-2">
<PromptInputTextarea
placeholder="Type your message..."
@ -414,6 +475,63 @@ function ChatInputInner({
</DropdownMenuContent>
</DropdownMenu>
)}
{onToggleTts && ttsAvailable && (
<div className="flex shrink-0 items-center">
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
onClick={onToggleTts}
className={cn(
'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
ttsEnabled
? 'text-foreground hover:bg-muted'
: 'text-muted-foreground hover:bg-muted hover:text-foreground'
)}
aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
>
<Headphones className="h-4 w-4" />
{!ttsEnabled && (
<span className="absolute inset-0 flex items-center justify-center pointer-events-none">
<span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
</span>
)}
</button>
</TooltipTrigger>
<TooltipContent side="top">
{ttsEnabled ? 'Voice output on' : 'Voice output off'}
</TooltipContent>
</Tooltip>
{ttsEnabled && onTtsModeChange && (
<DropdownMenu>
<DropdownMenuTrigger asChild>
<button
type="button"
className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
>
<ChevronDown className="h-3 w-3" />
</button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end">
<DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
<DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
<DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
</DropdownMenuRadioGroup>
</DropdownMenuContent>
</DropdownMenu>
)}
</div>
)}
{voiceAvailable && onStartRecording && (
<button
type="button"
onClick={onStartRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Voice input"
>
<Mic className="h-4 w-4" />
</button>
)}
{isProcessing ? (
<Button
size="icon"
@ -448,6 +566,31 @@ function ChatInputInner({
</Button>
)}
</div>
</>
)}
</div>
)
}
/** Animated waveform bars for the recording indicator */
/** Animated five-bar waveform shown while a recording is in progress. */
function VoiceWaveform() {
  // Stagger each bar's animation by 150ms to produce the rolling-wave effect.
  const bars = Array.from({ length: 5 }, (_, index) => (
    <span
      key={index}
      className="w-[3px] rounded-full bg-primary"
      style={{ animation: `voice-wave 1.2s ease-in-out ${index * 0.15}s infinite` }}
    />
  ))
  return (
    <div className="flex items-center gap-[3px] h-5">
      {bars}
      <style>{`
        @keyframes voice-wave {
          0%, 100% { height: 4px; }
          50% { height: 16px; }
        }
      `}</style>
    </div>
  )
}
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatInputWithMentions({
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputWithMentionsProps) {
return (
<PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
runId={runId}
initialDraft={initialDraft}
onDraftChange={onDraftChange}
isRecording={isRecording}
recordingText={recordingText}
recordingState={recordingState}
onStartRecording={onStartRecording}
onSubmitRecording={onSubmitRecording}
onCancelRecording={onCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={onToggleTts}
onTtsModeChange={onTtsModeChange}
/>
</PromptInputProvider>
)

View file

@ -108,6 +108,19 @@ interface ChatSidebarProps {
onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
onOpenKnowledgeFile?: (path: string) => void
onActivate?: () => void
// Voice / TTS props
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatSidebar({
@ -146,6 +159,18 @@ export function ChatSidebar({
onToolOpenChangeForTab,
onOpenKnowledgeFile,
onActivate,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatSidebarProps) {
const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
const [isResizing, setIsResizing] = useState(false)
@ -542,6 +567,18 @@ export function ChatSidebar({
runId={tabState.runId}
initialDraft={getInitialDraft?.(tab.id)}
onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
isRecording={isActive && isRecording}
recordingText={isActive ? recordingText : undefined}
recordingState={isActive ? recordingState : undefined}
onStartRecording={isActive ? onStartRecording : undefined}
onSubmitRecording={isActive ? onSubmitRecording : undefined}
onCancelRecording={isActive ? onCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? onToggleTts : undefined}
onTtsModeChange={isActive ? onTtsModeChange : undefined}
/>
</div>
)

View file

@ -0,0 +1,218 @@
import { useCallback, useEffect, useRef, useState } from 'react';
export type VoiceState = 'idle' | 'connecting' | 'listening';
// Cache the API key so we skip the IPC call after first use
let cachedApiKey: string | null = null;
let apiKeyFetched = false;
/**
 * Push-to-talk speech-to-text via Deepgram's realtime WebSocket API.
 *
 * The WebSocket is opened once on mount and kept alive (auto-reconnecting
 * after 3s on close) for as long as the hook is mounted; only microphone
 * capture starts/stops per recording.
 *
 * Returns:
 * - state: 'idle' | 'connecting' | 'listening'
 * - interimText: live transcript (finalized segments + current interim)
 * - start(): begin capturing mic audio and streaming it for transcription
 * - submit(): stop capturing and return the accumulated transcript
 * - cancel(): stop capturing and discard the transcript
 */
export function useVoiceMode() {
  const [state, setState] = useState<VoiceState>('idle');
  const [interimText, setInterimText] = useState('');
  const wsRef = useRef<WebSocket | null>(null);
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const audioCtxRef = useRef<AudioContext | null>(null);
  // Finalized transcript segments, joined with single spaces.
  const transcriptBufferRef = useRef('');
  // Latest non-final (interim) segment; replaced wholesale on each interim result.
  const interimRef = useRef('');
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const mountedRef = useRef(true);
  // Connect (or reconnect) the Deepgram WebSocket.
  // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
  const connectWs = useCallback(() => {
    if (!cachedApiKey) return;
    if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
    const ws = new WebSocket(
      `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`,
      ['token', cachedApiKey]
    );
    wsRef.current = ws;
    ws.onopen = () => {
      console.log('[voice] WebSocket connected');
    };
    ws.onmessage = (event) => {
      // Guard against malformed frames: an unguarded JSON.parse throw here
      // would surface as an unhandled error inside the WS handler.
      let data: { channel?: { alternatives?: { transcript?: string }[] }; is_final?: boolean };
      try {
        data = JSON.parse(event.data);
      } catch {
        return;
      }
      if (!data.channel?.alternatives?.[0]) return;
      const transcript = data.channel.alternatives[0].transcript;
      if (!transcript) return;
      if (data.is_final) {
        // Finalized segment: append to the committed buffer and drop the interim.
        transcriptBufferRef.current += (transcriptBufferRef.current ? ' ' : '') + transcript;
        interimRef.current = '';
        setInterimText(transcriptBufferRef.current);
      } else {
        // Interim segment: show committed text + the current best guess.
        interimRef.current = transcript;
        setInterimText(transcriptBufferRef.current + (transcriptBufferRef.current ? ' ' : '') + transcript);
      }
    };
    ws.onerror = () => {
      console.error('[voice] WebSocket error');
    };
    ws.onclose = () => {
      console.log('[voice] WebSocket closed');
      wsRef.current = null;
      // Auto-reconnect after 3 seconds if still mounted
      if (mountedRef.current && cachedApiKey) {
        reconnectTimerRef.current = setTimeout(() => {
          if (mountedRef.current) connectWs();
        }, 3000);
      }
    };
  }, []);
  // Fetch API key on mount and establish persistent WebSocket
  useEffect(() => {
    mountedRef.current = true;
    const init = async () => {
      if (!apiKeyFetched) {
        apiKeyFetched = true;
        try {
          const config = await window.ipc.invoke('voice:getConfig', null);
          cachedApiKey = config.deepgram?.apiKey ?? null;
        } catch { /* ignore */ }
      }
      if (cachedApiKey && mountedRef.current) {
        connectWs();
      }
    };
    void init();
    return () => {
      mountedRef.current = false;
      if (reconnectTimerRef.current) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }
      // Close WS on unmount, suppress reconnect by nulling onclose
      if (wsRef.current) {
        wsRef.current.onclose = null;
        wsRef.current.close();
        wsRef.current = null;
      }
    };
  }, [connectWs]);
  // Stop only audio capture (mic + processor), leaving WS open
  const stopAudioCapture = useCallback(() => {
    if (processorRef.current) {
      processorRef.current.disconnect();
      processorRef.current = null;
    }
    if (audioCtxRef.current) {
      // close() returns a promise; this is deliberately fire-and-forget.
      void audioCtxRef.current.close();
      audioCtxRef.current = null;
    }
    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach(t => t.stop());
      mediaStreamRef.current = null;
    }
    setInterimText('');
    transcriptBufferRef.current = '';
    interimRef.current = '';
    setState('idle');
  }, []);
  /** Begin a recording: ensure the WS is open, then stream 16 kHz PCM from the mic. */
  const start = useCallback(async () => {
    if (state !== 'idle') return;
    // Ensure we have an API key (the mount-time fetch may have failed).
    if (!cachedApiKey) {
      try {
        const config = await window.ipc.invoke('voice:getConfig', null);
        cachedApiKey = config.deepgram?.apiKey ?? null;
      } catch { /* ignore */ }
    }
    if (!cachedApiKey) {
      console.error('Deepgram not configured');
      return;
    }
    transcriptBufferRef.current = '';
    interimRef.current = '';
    setInterimText('');
    // If WS isn't connected, connect and wait for it
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
      setState('connecting');
      connectWs();
      // Wait for WS to be ready (up to 5 seconds). Both timers are cleared
      // on whichever path wins so neither leaks past the race.
      const wsOk = await new Promise<boolean>((resolve) => {
        const deadline = setTimeout(() => {
          clearInterval(poll);
          resolve(false);
        }, 5000);
        const poll = setInterval(() => {
          if (wsRef.current?.readyState === WebSocket.OPEN) {
            clearTimeout(deadline);
            clearInterval(poll);
            resolve(true);
          }
        }, 50);
      });
      if (!wsOk) {
        setState('idle');
        return;
      }
    }
    setState('listening');
    // Start mic
    let stream: MediaStream | null = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error('Microphone access denied:', err);
      setState('idle');
      return;
    }
    mediaStreamRef.current = stream;
    // Start audio capture.
    // NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet;
    // kept here to preserve behavior — consider migrating.
    const audioCtx = new AudioContext({ sampleRate: 16000 });
    audioCtxRef.current = audioCtx;
    const source = audioCtx.createMediaStreamSource(stream);
    const processor = audioCtx.createScriptProcessor(4096, 1, 1);
    processorRef.current = processor;
    processor.onaudioprocess = (e) => {
      if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
      // Convert float32 [-1, 1] samples to signed 16-bit PCM for Deepgram.
      const float32 = e.inputBuffer.getChannelData(0);
      const int16 = new Int16Array(float32.length);
      for (let i = 0; i < float32.length; i++) {
        const s = Math.max(-1, Math.min(1, float32[i]));
        int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      wsRef.current.send(int16.buffer);
    };
    source.connect(processor);
    // The processor must be connected to a destination to fire onaudioprocess.
    processor.connect(audioCtx.destination);
  }, [state, connectWs]);
  /** Stop recording and return the full transcript (finalized + any current interim) */
  const submit = useCallback((): string => {
    let text = transcriptBufferRef.current;
    if (interimRef.current) {
      text += (text ? ' ' : '') + interimRef.current;
    }
    text = text.trim();
    stopAudioCapture();
    return text;
  }, [stopAudioCapture]);
  /** Cancel recording without returning transcript */
  const cancel = useCallback(() => {
    stopAudioCapture();
  }, [stopAudioCapture]);
  return { state, interimText, start, submit, cancel };
}

View file

@ -0,0 +1,72 @@
import { useCallback, useRef, useState } from 'react';
export type TTSState = 'idle' | 'synthesizing' | 'speaking';
/**
 * Text-to-speech playback queue.
 *
 * Sentences passed to speak() are queued and played one at a time: each is
 * synthesized via the 'voice:synthesize' IPC channel, then played through an
 * HTMLAudioElement from a base64 data URL. cancel() flushes the queue and
 * stops any in-flight playback immediately.
 *
 * Bug fixed vs. the original: cancel() paused the audio, but pause() never
 * fires 'onended', so the playback promise inside processQueue never settled
 * and the loop hung forever — while cancel() also reset processingRef, letting
 * the next speak() start a second concurrent loop. The resolver ref and
 * cancelled flag below make cancel() settle the pending promise and drain the
 * loop cleanly.
 */
export function useVoiceTTS() {
  const [state, setState] = useState<TTSState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const queueRef = useRef<string[]>([]);
  const processingRef = useRef(false);
  // Set by cancel(); checked after each await so the loop drains promptly.
  const cancelledRef = useRef(false);
  // Settles the in-flight playback promise when cancel() pauses the audio.
  const resolvePlaybackRef = useRef<(() => void) | null>(null);
  const processQueue = useCallback(async () => {
    if (processingRef.current) return;
    processingRef.current = true;
    cancelledRef.current = false;
    while (queueRef.current.length > 0 && !cancelledRef.current) {
      const text = queueRef.current.shift()!;
      if (!text.trim()) continue;
      setState('synthesizing');
      console.log('[tts] synthesizing:', text.substring(0, 80));
      try {
        const result = await window.ipc.invoke('voice:synthesize', { text });
        // A cancel() may have arrived while we awaited synthesis.
        if (cancelledRef.current) break;
        console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
        setState('speaking');
        await new Promise<void>((resolve, reject) => {
          resolvePlaybackRef.current = resolve;
          const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
          const audio = new Audio(dataUrl);
          audioRef.current = audio;
          audio.onended = () => {
            console.log('[tts] audio ended');
            resolve();
          };
          audio.onerror = (e) => {
            console.error('[tts] audio error:', e);
            reject(new Error('Audio playback failed'));
          };
          audio.play().then(() => {
            console.log('[tts] audio playing');
          }).catch((err) => {
            console.error('[tts] play() rejected:', err);
            reject(err);
          });
        });
      } catch (err) {
        console.error('[tts] error:', err);
      } finally {
        resolvePlaybackRef.current = null;
      }
    }
    audioRef.current = null;
    processingRef.current = false;
    setState('idle');
  }, []);
  /** Queue a sentence for synthesis + playback. Safe to call while speaking. */
  const speak = useCallback((text: string) => {
    console.log('[tts] speak() called:', text.substring(0, 80));
    queueRef.current.push(text);
    // Fire-and-forget: processQueue handles its own errors internally.
    void processQueue();
  }, [processQueue]);
  /** Flush the queue and stop any current playback immediately. */
  const cancel = useCallback(() => {
    cancelledRef.current = true;
    queueRef.current = [];
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current = null;
    }
    // Settle the pending playback promise so processQueue's loop can exit;
    // pause() alone never fires onended. The loop resets processingRef itself.
    resolvePlaybackRef.current?.();
    setState('idle');
  }, []);
  return { state, speak, cancel };
}

View file

@ -894,11 +894,19 @@ export async function* streamAgent({
}
// get any queued user messages
let voiceInput = false;
let voiceOutput: 'summary' | 'full' | null = null;
while (true) {
const msg = await messageQueue.dequeue(runId);
if (!msg) {
break;
}
if (msg.voiceInput) {
voiceInput = true;
}
if (msg.voiceOutput) {
voiceOutput = msg.voiceOutput;
}
loopLogger.log('dequeued user message', msg.messageId);
yield* processEvent({
runId,
@ -938,7 +946,18 @@ export async function* streamAgent({
minute: '2-digit',
timeZoneName: 'short'
});
const instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
let instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
if (voiceInput) {
loopLogger.log('voice input enabled, injecting voice input prompt');
instructionsWithDateTime += `\n\n# Voice Input\nThe user's message was transcribed from speech. Be aware that:\n- There may be transcription errors. Silently correct obvious ones (e.g. homophones, misheard words). If an error is genuinely ambiguous, briefly mention your interpretation (e.g. "I'm assuming you meant X").\n- Spoken messages are often long-winded. The user may ramble, repeat themselves, or correct something they said earlier in the same message. Focus on their final intent, not every word verbatim.`;
}
if (voiceOutput === 'summary') {
loopLogger.log('voice output enabled (summary mode), injecting voice output prompt');
instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with <voice></voice> tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with <voice> tags.\n\nRules:\n1. ALWAYS start your response with one or more <voice> tags. Never skip them.\n2. Place ALL <voice> tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse <voice> tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate <voice> tag so it can be spoken incrementally. Do NOT wrap everything in a single <voice> block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\n<voice>Your meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.</voice>\n<voice>I've pulled out the key details and action items below — the demo prep notes are at the end.</voice>\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more <voice> tags.)\n\nAny text outside <voice> tags is shown visually but not spoken.`;
} else if (voiceOutput === 'full') {
loopLogger.log('voice output enabled (full mode), injecting voice output prompt');
instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in <voice></voice> tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate <voice> tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a <voice> tag. Do not leave any content outside <voice> tags.\n\nExample:\n<voice>Your meeting with Sarah covered three main things.</voice>\n<voice>First, you discussed the Q2 roadmap timeline and agreed to push the launch to April.</voice>\n<voice>Second, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.</voice>\n<voice>And lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.</voice>`;
}
let streamError: string | null = null;
for await (const event of streamLlm(
model,

View file

@ -33,6 +33,8 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects,
**Document Collaboration:** When users ask you to work on a document, collaborate on writing, create a new document, edit/refine existing notes, or say things like "let's work on [X]", "help me write [X]", "create a doc for [X]", or "let's draft [X]", you MUST load the \`doc-collab\` skill first. This is required for any document creation or editing task. The skill provides structured guidance for creating, editing, and refining documents in the knowledge base.
**App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view.
**Slack:** When users ask about Slack messages, want to send messages to teammates, check channel conversations, or find someone on Slack, load the \`slack\` skill. You can send messages, view channel history, search conversations, and find users. Always show message drafts to the user before sending.
## Memory That Compounds
@ -184,6 +186,7 @@ ${runtimeContextPrompt}
- \`loadSkill\` - Skill loading
- \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them.
- \`web-search\` and \`research-search\` - Web and research search tools (available when configured). **You MUST load the \`web-search\` skill before using either of these tools.** It tells you which tool to pick and how many searches to do.
- \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.**
**Prefer these tools whenever possible** — they work instantly with zero friction. For file operations inside \`~/.rowboat/\`, always use these instead of \`executeCommand\`.

View file

@ -44,6 +44,7 @@ Change filters, columns, sort order, or search in the bases (table) view.
- If unsure what categories/values are available, call ` + "`get-base-state`" + ` first.
- For "show me X", prefer ` + "`filters.set`" + ` to start fresh rather than ` + "`filters.add`" + `.
- Categories come from frontmatter keys (e.g., relationship, status, topic, type).
- **CRITICAL: Do NOT pass ` + "`columns`" + ` unless the user explicitly asks to show/hide specific columns.** Omit the ` + "`columns`" + ` parameter entirely when only filtering, sorting, or searching. Passing ` + "`columns`" + ` will override the user's current column layout and can make the view appear empty.
### get-base-state
Retrieve information about what's in the knowledge base — available filter categories, values, and note count.
@ -75,6 +76,7 @@ Save the current view configuration as a named base.
- The ` + "`update-base-view`" + ` action will automatically navigate to the bases view if the user isn't already there.
- ` + "`open-note`" + ` validates that the file exists before navigating.
- Filter categories and values come from frontmatter in knowledge files.
- **Never send ` + "`columns`" + ` or ` + "`sort`" + ` with ` + "`update-base-view`" + ` unless the user specifically asks to change them.** Only pass the parameters you intend to change — omitted parameters are left untouched.
`;
export default skill;

View file

@ -884,6 +884,145 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
},
},
// ============================================================================
// App Navigation
// ============================================================================
// Builtin tool that lets the agent drive the app UI. Most actions return a
// structured payload that the renderer interprets to perform the navigation;
// only 'open-note', 'get-base-state', and 'create-base' touch the workspace.
'app-navigation': {
  description: 'Control the app UI - navigate to notes, switch views, filter/search the knowledge base, and manage saved views.',
  inputSchema: z.object({
    action: z.enum(["open-note", "open-view", "update-base-view", "get-base-state", "create-base"]).describe("The navigation action to perform"),
    // open-note
    path: z.string().optional().describe("Knowledge file path for open-note, e.g. knowledge/People/John.md"),
    // open-view
    view: z.enum(["bases", "graph"]).optional().describe("Which view to open (for open-view action)"),
    // update-base-view
    filters: z.object({
      set: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Replace all filters with these"),
      add: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Add these filters"),
      remove: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Remove these filters"),
      clear: z.boolean().optional().describe("Clear all filters"),
    }).optional().describe("Filter modifications (for update-base-view)"),
    columns: z.object({
      set: z.array(z.string()).optional().describe("Replace visible columns with these"),
      add: z.array(z.string()).optional().describe("Add these columns"),
      remove: z.array(z.string()).optional().describe("Remove these columns"),
    }).optional().describe("Column modifications (for update-base-view)"),
    sort: z.object({
      field: z.string(),
      dir: z.enum(["asc", "desc"]),
    }).optional().describe("Sort configuration (for update-base-view)"),
    search: z.string().optional().describe("Search query to filter notes (for update-base-view)"),
    // get-base-state
    base_name: z.string().optional().describe("Name of a saved base to inspect (for get-base-state). Omit for the current/default view."),
    // create-base
    name: z.string().optional().describe("Name for the saved base view (for create-base)"),
  }),
  execute: async (input: {
    action: string;
    [key: string]: unknown;
  }) => {
    switch (input.action) {
      case 'open-note': {
        // Validate the path before touching the filesystem; the schema marks
        // it optional, so a missing path must yield an error result, not a
        // lookup on `undefined`.
        const filePath = input.path;
        if (typeof filePath !== 'string' || !filePath) {
          return { success: false, error: 'Missing required parameter: path' };
        }
        try {
          const result = await workspace.exists(filePath);
          if (!result.exists) {
            return { success: false, error: `File not found: ${filePath}` };
          }
          return { success: true, action: 'open-note', path: filePath };
        } catch {
          return { success: false, error: `Could not access file: ${filePath}` };
        }
      }
      case 'open-view': {
        // Pure UI instruction; the renderer performs the actual switch.
        const view = input.view as string;
        return { success: true, action: 'open-view', view };
      }
      case 'update-base-view': {
        // Forward only the pieces the caller supplied so omitted parameters
        // leave the user's current view untouched.
        const updates: Record<string, unknown> = {};
        if (input.filters) updates.filters = input.filters;
        if (input.columns) updates.columns = input.columns;
        if (input.sort) updates.sort = input.sort;
        if (input.search !== undefined) updates.search = input.search;
        return { success: true, action: 'update-base-view', updates };
      }
      case 'get-base-state': {
        // Scan knowledge/ files and extract frontmatter properties.
        // NOTE(review): `base_name` is accepted by the schema but not used
        // here — the scan always reflects the whole knowledge base.
        try {
          const { parseFrontmatter } = await import("@x/shared/dist/frontmatter.js");
          const entries = await workspace.readdir("knowledge", { recursive: true, allowedExtensions: [".md"] });
          const files = entries.filter(e => e.kind === 'file');
          // category name -> set of distinct values seen across all notes
          const properties = new Map<string, Set<string>>();
          let noteCount = 0;
          for (const file of files) {
            try {
              const { data } = await workspace.readFile(file.path);
              const { fields } = parseFrontmatter(data);
              noteCount++;
              for (const [key, value] of Object.entries(fields)) {
                if (!value) continue;
                let set = properties.get(key);
                if (!set) { set = new Set(); properties.set(key, set); }
                const values = Array.isArray(value) ? value : [value];
                for (const v of values) {
                  // Frontmatter values may be non-strings (numbers, dates,
                  // booleans); coerce before trimming so a numeric value
                  // doesn't throw and abort the whole file's scan.
                  const trimmed = String(v).trim();
                  if (trimmed) set.add(trimmed);
                }
              }
            } catch {
              // skip unreadable files
            }
          }
          const availableProperties: Record<string, string[]> = {};
          for (const [key, values] of properties) {
            availableProperties[key] = [...values].sort();
          }
          return {
            success: true,
            action: 'get-base-state',
            noteCount,
            availableProperties,
          };
        } catch (error) {
          return {
            success: false,
            error: error instanceof Error ? error.message : 'Failed to read knowledge base',
          };
        }
      }
      case 'create-base': {
        // `name` is optional in the schema; a missing name must produce an
        // error result rather than a TypeError on `.replace()`.
        const rawName = typeof input.name === 'string' ? input.name : '';
        // Strip characters unsafe for filenames; empty result means invalid.
        const safeName = rawName.replace(/[^a-zA-Z0-9_\- ]/g, '').trim();
        if (!safeName) {
          return { success: false, error: 'Invalid base name' };
        }
        const basePath = `bases/${safeName}.base`;
        try {
          const config = { name: safeName, filters: [], columns: [] };
          await workspace.writeFile(basePath, JSON.stringify(config, null, 2), { mkdirp: true });
          return { success: true, action: 'create-base', name: safeName, path: basePath };
        } catch (error) {
          return {
            success: false,
            error: error instanceof Error ? error.message : 'Failed to create base',
          };
        }
      }
      default:
        return { success: false, error: `Unknown action: ${input.action}` };
    }
  },
},
// ============================================================================
// Web Search (Brave Search API)
// ============================================================================

View file

@ -3,14 +3,17 @@ import { UserMessageContent } from "@x/shared/dist/message.js";
import z from "zod";
export type UserMessageContentType = z.infer<typeof UserMessageContent>;
export type VoiceOutputMode = 'summary' | 'full';
type EnqueuedMessage = {
messageId: string;
message: UserMessageContentType;
voiceInput?: boolean;
voiceOutput?: VoiceOutputMode;
};
export interface IMessageQueue {
enqueue(runId: string, message: UserMessageContentType): Promise<string>;
enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string>;
dequeue(runId: string): Promise<EnqueuedMessage | null>;
}
@ -26,7 +29,7 @@ export class InMemoryMessageQueue implements IMessageQueue {
this.idGenerator = idGenerator;
}
async enqueue(runId: string, message: UserMessageContentType): Promise<string> {
async enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
if (!this.store[runId]) {
this.store[runId] = [];
}
@ -34,6 +37,8 @@ export class InMemoryMessageQueue implements IMessageQueue {
this.store[runId].push({
messageId: id,
message,
voiceInput,
voiceOutput,
});
return id;
}
@ -44,4 +49,4 @@ export class InMemoryMessageQueue implements IMessageQueue {
}
return this.store[runId].shift() ?? null;
}
}
}

View file

@ -9,3 +9,6 @@ export { initConfigs } from './config/initConfigs.js';
// Knowledge version history
export * as versionHistory from './knowledge/version_history.js';
// Voice mode (config + TTS)
export * as voice from './voice/voice.js';

View file

@ -64,7 +64,7 @@ export function createProvider(config: z.infer<typeof Provider>): ProviderV2 {
apiKey,
baseURL,
headers,
});
}) as unknown as ProviderV2;
default:
throw new Error(`Unsupported provider flavor: ${config.flavor}`);
}

View file

@ -1,6 +1,6 @@
import z from "zod";
import container from "../di/container.js";
import { IMessageQueue, UserMessageContentType } from "../application/lib/message-queue.js";
import { IMessageQueue, UserMessageContentType, VoiceOutputMode } from "../application/lib/message-queue.js";
import { AskHumanResponseEvent, ToolPermissionRequestEvent, ToolPermissionResponseEvent, CreateRunOptions, Run, ListRunsResponse, ToolPermissionAuthorizePayload, AskHumanResponsePayload } from "@x/shared/dist/runs.js";
import { IRunsRepo } from "./repo.js";
import { IAgentRuntime } from "../agents/runtime.js";
@ -19,9 +19,9 @@ export async function createRun(opts: z.infer<typeof CreateRunOptions>): Promise
return run;
}
export async function createMessage(runId: string, message: UserMessageContentType): Promise<string> {
export async function createMessage(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
const queue = container.resolve<IMessageQueue>('messageQueue');
const id = await queue.enqueue(runId, message);
const id = await queue.enqueue(runId, message, voiceInput, voiceOutput);
const runtime = container.resolve<IAgentRuntime>('agentRuntime');
runtime.trigger(runId);
return id;

View file

@ -0,0 +1,70 @@
import * as fs from 'fs/promises';
import * as path from 'path';
// User home directory: HOME on POSIX, USERPROFILE on Windows; empty string
// if neither is set (config lookups will then simply fail and return null).
const homedir = process.env.HOME || process.env.USERPROFILE || '';
// Voice-mode provider credentials loaded from ~/.rowboat/config/.
// A null entry means that provider is not configured on this machine.
export interface VoiceConfig {
// Deepgram credentials — presumably used for speech-to-text input
// (only the key is surfaced here; usage lives in the renderer). TODO confirm.
deepgram: { apiKey: string } | null;
// ElevenLabs credentials for text-to-speech; voiceId optionally overrides
// the default voice used by synthesizeSpeech().
elevenlabs: { apiKey: string; voiceId?: string } | null;
}
/**
 * Best-effort read of a JSON config file under ~/.rowboat/config/.
 *
 * @param filename File name within the config directory, e.g. 'deepgram.json'.
 * @returns The parsed object, or null when the file is missing, unreadable,
 *          or contains malformed JSON — all treated as "not configured".
 */
async function readJsonConfig(filename: string): Promise<Record<string, unknown> | null> {
  const configPath = path.join(homedir, '.rowboat', 'config', filename);
  try {
    const contents = await fs.readFile(configPath, 'utf8');
    return JSON.parse(contents);
  } catch {
    // Any failure (ENOENT, permissions, bad JSON) means the provider
    // is simply not set up; callers handle the null.
    return null;
  }
}
/**
 * Load voice-mode provider credentials from disk.
 *
 * Reads deepgram.json and elevenlabs.json from ~/.rowboat/config/; each
 * provider entry is null unless its file exists and contains an `apiKey`.
 */
export async function getVoiceConfig(): Promise<VoiceConfig> {
  // The two reads are independent — fetch them in parallel.
  const [dg, el] = await Promise.all([
    readJsonConfig('deepgram.json'),
    readJsonConfig('elevenlabs.json'),
  ]);
  const deepgram = dg?.apiKey ? { apiKey: dg.apiKey as string } : null;
  const elevenlabs = el?.apiKey
    ? { apiKey: el.apiKey as string, voiceId: el.voiceId as string | undefined }
    : null;
  return { deepgram, elevenlabs };
}
/**
 * Convert text to speech via the ElevenLabs REST API.
 *
 * @param text Text to synthesize.
 * @returns Base64-encoded audio plus its MIME type ('audio/mpeg').
 * @throws When ElevenLabs is not configured, or the API returns a non-2xx
 *         status (the response body is included in the error message).
 */
export async function synthesizeSpeech(text: string): Promise<{ audioBase64: string; mimeType: string }> {
  const { elevenlabs } = await getVoiceConfig();
  if (!elevenlabs) {
    throw new Error('ElevenLabs not configured. Create ~/.rowboat/config/elevenlabs.json with { "apiKey": "<your-key>" }');
  }

  // Fall back to a hard-coded default voice when none is configured.
  const voiceId = elevenlabs.voiceId || 'UgBBYS2sOqTuMpoF3BR0';
  console.log('[voice] synthesizing speech, text length:', text.length, 'voiceId:', voiceId);

  const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
    method: 'POST',
    headers: {
      'xi-api-key': elevenlabs.apiKey,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      text,
      model_id: 'eleven_multilingual_v2',
      voice_settings: { stability: 0.5, similarity_boost: 0.75 },
    }),
  });

  if (!response.ok) {
    // Surface the API's error body; fall back if even reading it fails.
    const errText = await response.text().catch(() => 'Unknown error');
    console.error('[voice] ElevenLabs API error:', response.status, errText);
    throw new Error(`ElevenLabs API error ${response.status}: ${errText}`);
  }

  const audioBase64 = Buffer.from(await response.arrayBuffer()).toString('base64');
  console.log('[voice] synthesized audio, base64 length:', audioBase64.length);
  return { audioBase64, mimeType: 'audio/mpeg' };
}

View file

@ -130,6 +130,8 @@ const ipcSchemas = {
req: z.object({
runId: z.string(),
message: UserMessageContent,
voiceInput: z.boolean().optional(),
voiceOutput: z.enum(['summary', 'full']).optional(),
}),
res: z.object({
messageId: z.string(),
@ -460,6 +462,23 @@ const ipcSchemas = {
})),
}),
},
// Voice mode channels
'voice:getConfig': {
req: z.null(),
res: z.object({
deepgram: z.object({ apiKey: z.string() }).nullable(),
elevenlabs: z.object({ apiKey: z.string(), voiceId: z.string().optional() }).nullable(),
}),
},
'voice:synthesize': {
req: z.object({
text: z.string(),
}),
res: z.object({
audioBase64: z.string(),
mimeType: z.string(),
}),
},
// Inline task schedule classification
'inline-task:classifySchedule': {
req: z.object({