voice mode with speech-to-text input and TTS output

2026-07-18 21:21:11 +02:00 · 2026-03-13 10:26:08 +05:30 · 2026-03-13 10:26:08 +05:30 · 8db1a091f0
commit 8db1a091f0
parent e730c118dc
17 changed files with 937 additions and 15 deletions
--- a/apps/x/apps/main/src/ipc.ts
+++ b/apps/x/apps/main/src/ipc.ts
@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
 import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
 import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
 import { search } from '@x/core/dist/search/search.js';
-import { versionHistory } from '@x/core';
+import { versionHistory, voice } from '@x/core';
 import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';
 type InvokeChannels = ipc.InvokeChannels;
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
      return runsCore.createRun(args);
    },
    'runs:createMessage': async (_event, args) => {
-      return { messageId: await runsCore.createMessage(args.runId, args.message) };
+      return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
    },
    'runs:authorizePermission': async (_event, args) => {
      await runsCore.authorizePermission(args.runId, args.authorization);
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
      const schedule = await classifySchedule(args.instruction);
      return { schedule };
    },
    // Returns core's voice configuration to the renderer. The renderer checks for
    // the presence of `deepgram` / `elevenlabs` to enable voice input / TTS, and
    // reads `deepgram.apiKey` to open its transcription WebSocket.
    'voice:getConfig': async () => {
      return voice.getVoiceConfig();
    },
    // Synthesizes speech for `args.text` via core. The renderer consumes the
    // result's `mimeType` and `audioBase64` fields to build a playable data URL.
    'voice:synthesize': async (_event, args) => {
      return voice.synthesizeSpeech(args.text);
    },
  });
 }
--- a/apps/x/apps/main/src/main.ts
+++ b/apps/x/apps/main/src/main.ts
@ -1,4 +1,4 @@
-import { app, BrowserWindow, protocol, net, shell } from "electron";
+import { app, BrowserWindow, protocol, net, shell, session } from "electron";
 import path from "node:path";
 import {
  setupIpcHandlers,
@ -92,6 +92,15 @@ function createWindow() {
    },
  });
  // Voice mode needs microphone access: approve 'media' permission requests and
  // refuse every other permission type.
  // NOTE(review): 'media' is not microphone-specific — confirm whether this
  // should be narrowed to audio-only requests.
  session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
    const isMediaRequest = permission === 'media';
    callback(isMediaRequest);
  });
  // Show window when content is ready to prevent blank screen
  win.once("ready-to-show", () => {
    win.show();
--- a/apps/x/apps/renderer/src/App.tsx
+++ b/apps/x/apps/renderer/src/App.tsx
@ -76,6 +76,8 @@ import {
 import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
 import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
 import { toast } from "sonner"
 import { useVoiceMode } from '@/hooks/useVoiceMode'
 import { useVoiceTTS } from '@/hooks/useVoiceTTS'
 type DirEntry = z.infer<typeof workspace.DirEntry>
 type RunEventType = z.infer<typeof RunEvent>
@ -546,6 +548,87 @@ function App() {
  const [agentId] = useState<string>('copilot')
  const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)
  // Voice mode state
  const [voiceAvailable, setVoiceAvailable] = useState(false)
  const [ttsAvailable, setTtsAvailable] = useState(false)
  const [ttsEnabled, setTtsEnabled] = useState(false)
  const ttsEnabledRef = useRef(false)
  const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
  const ttsModeRef = useRef<'summary' | 'full'>('summary')
  const [isRecording, setIsRecording] = useState(false)
  const voiceTextBufferRef = useRef('')
  const spokenIndexRef = useRef(0)
  const isRecordingRef = useRef(false)
  const tts = useVoiceTTS()
  const ttsRef = useRef(tts)
  ttsRef.current = tts
  const voice = useVoiceMode()
  const voiceRef = useRef(voice)
  voiceRef.current = voice
  // Check if voice is available on mount
  useEffect(() => {
    window.ipc.invoke('voice:getConfig', null).then(config => {
      setVoiceAvailable(!!config.deepgram)
      setTtsAvailable(!!config.elevenlabs)
    }).catch(() => {
      setVoiceAvailable(false)
      setTtsAvailable(false)
    })
  }, [])
  const handleStartRecording = useCallback(() => {
    setIsRecording(true)
    isRecordingRef.current = true
    voice.start()
  }, [voice])
  const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
  const pendingVoiceInputRef = useRef(false)
  const handleSubmitRecording = useCallback(() => {
    const text = voice.submit()
    setIsRecording(false)
    isRecordingRef.current = false
    if (text) {
      pendingVoiceInputRef.current = true
      handlePromptSubmitRef.current?.({ text })
    }
  }, [voice])
  // Toggle spoken output on/off.
  // ttsEnabledRef mirrors the ttsEnabled state so streaming handlers can read the
  // latest value without re-subscribing. The next value is derived from the ref
  // and all side effects run here, outside the setState updater: React expects
  // updater functions to be pure and may call them more than once.
  const handleToggleTts = useCallback(() => {
    const next = !ttsEnabledRef.current
    ttsEnabledRef.current = next
    if (!next) {
      // Turning voice output off: cancel via the TTS hook.
      ttsRef.current.cancel()
    }
    setTtsEnabled(next)
  }, [])
  const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
    setTtsMode(mode)
    ttsModeRef.current = mode
  }, [])
  const handleCancelRecording = useCallback(() => {
    voice.cancel()
    setIsRecording(false)
    isRecordingRef.current = false
  }, [voice])
  // Helper to cancel recording from any navigation handler
  const cancelRecordingIfActive = useCallback(() => {
    if (isRecordingRef.current) {
      voiceRef.current.cancel()
      setIsRecording(false)
      isRecordingRef.current = false
    }
  }, [])
  // Runs history state
  type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
  const [runs, setRuns] = useState<RunListItem[]>([])
@ -1496,6 +1579,9 @@ function App() {
        if (!isActiveRun) return
        setIsProcessing(true)
        setModelUsage(null)
        // Reset voice buffer for new response
        voiceTextBufferRef.current = ''
        spokenIndexRef.current = 0
        break
      case 'run-processing-end':
@ -1545,6 +1631,20 @@ function App() {
          if (llmEvent.type === 'text-delta' && llmEvent.delta) {
            appendStreamingBuffer(event.runId, llmEvent.delta)
            setCurrentAssistantMessage(prev => prev + llmEvent.delta)
            // Extract completed <voice>…</voice> segments from the streamed text and
            // send them to TTS when enabled. spokenIndexRef records how far into the
            // buffer has already been consumed, so each segment is handled once.
            voiceTextBufferRef.current += llmEvent.delta
            const scanStart = spokenIndexRef.current
            const remaining = voiceTextBufferRef.current.substring(scanStart)
            const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
            let voiceMatch: RegExpExecArray | null
            while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
              const voiceContent = voiceMatch[1].trim()
              console.log('[voice] extracted voice tag:', voiceContent)
              if (voiceContent && ttsEnabledRef.current) {
                ttsRef.current.speak(voiceContent)
              }
              // Match offsets are relative to `remaining`, so rebase each one on the
              // fixed scan start. Accumulating with `+=` overshoots as soon as a
              // single scan finds more than one tag, which makes later tags get skipped.
              spokenIndexRef.current = scanStart + voiceMatch.index + voiceMatch[0].length
            }
          } else if (llmEvent.type === 'tool-call') {
            setConversation(prev => [...prev, {
              id: llmEvent.toolCallId || `tool-${Date.now()}`,
@ -1584,6 +1684,7 @@ function App() {
          if (msg.role === 'assistant') {
            setCurrentAssistantMessage(currentMsg => {
              if (currentMsg) {
                const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
                setConversation(prev => {
                  const exists = prev.some(m =>
                    m.id === event.messageId && 'role' in m && m.role === 'assistant'
@ -1592,7 +1693,7 @@ function App() {
                  return [...prev, {
                    id: event.messageId,
                    role: 'assistant',
-                    content: currentMsg,
+                    content: cleanedContent,
                    timestamp: Date.now(),
                  }]
                })
@ -1887,6 +1988,8 @@ function App() {
        await window.ipc.invoke('runs:createMessage', {
          runId: currentRunId,
          message: attachmentPayload,
          voiceInput: pendingVoiceInputRef.current || undefined,
          voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
        })
      } else {
        // Legacy path: plain string with optional XML-formatted @mentions.
@ -1915,11 +2018,15 @@ function App() {
        await window.ipc.invoke('runs:createMessage', {
          runId: currentRunId,
          message: formattedMessage,
          voiceInput: pendingVoiceInputRef.current || undefined,
          voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
        })
        titleSource = formattedMessage
      }
      pendingVoiceInputRef.current = false
      if (isNewRun) {
        const inferredTitle = inferRunTitleFromMessage(titleSource)
        setRuns((prev) => {
@ -1936,6 +2043,7 @@ function App() {
      console.error('Failed to send message:', error)
    }
  }
  handlePromptSubmitRef.current = handlePromptSubmit
  const handleStop = useCallback(async () => {
    if (!runId) return
@ -2065,6 +2173,7 @@ function App() {
  }, [])
  const openChatInNewTab = useCallback((targetRunId: string) => {
    cancelRecordingIfActive()
    const existingTab = chatTabs.find(t => t.runId === targetRunId)
    if (existingTab) {
      // Cancel stale in-flight loads from previously focused tabs.
@ -2080,12 +2189,18 @@ function App() {
    setChatTabs(prev => [...prev, { id, runId: targetRunId }])
    setActiveChatTabId(id)
    loadRun(targetRunId)
-  }, [chatTabs, loadRun, restoreChatTabState])
+  }, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])
  const switchChatTab = useCallback((tabId: string) => {
    const tab = chatTabs.find(t => t.id === tabId)
    if (!tab) return
    if (tabId === activeChatTabId) return
    // Cancel any active recording when switching tabs
    if (isRecordingRef.current) {
      voiceRef.current.cancel()
      setIsRecording(false)
      isRecordingRef.current = false
    }
    saveChatScrollForTab(activeChatTabId)
    // Cancel stale in-flight loads from previously focused tabs.
    loadRunRequestIdRef.current += 1
@ -2471,13 +2586,14 @@ function App() {
    const current = currentViewState
    if (viewStatesEqual(current, nextView)) return
    cancelRecordingIfActive()
    const nextHistory = {
      back: appendUnique(historyRef.current.back, current),
      forward: [] as ViewState[],
    }
    setHistory(nextHistory)
    await applyViewState(nextView)
-  }, [appendUnique, applyViewState, currentViewState, setHistory])
+  }, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])
  const navigateBack = useCallback(async () => {
    const { back, forward } = historyRef.current
@ -3412,6 +3528,7 @@ function App() {
              tasksActions={{
                onNewChat: handleNewChatTab,
                onSelectRun: (runIdToLoad) => {
                  cancelRecordingIfActive()
                  if (selectedPath || isGraphOpen) {
                    setIsChatSidebarOpen(true)
                  }
@ -3814,7 +3931,7 @@ function App() {
                                {tabState.currentAssistantMessage && (
                                  <Message from="assistant">
                                    <MessageContent>
-                                      <MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
+                                      <MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
                                    </MessageContent>
                                  </Message>
                                )}
@ -3865,6 +3982,18 @@ function App() {
                            runId={tabState.runId}
                            initialDraft={chatDraftsRef.current.get(tab.id)}
                            onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
                            isRecording={isActive && isRecording}
                            recordingText={isActive ? voice.interimText : undefined}
                            recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
                            onStartRecording={isActive ? handleStartRecording : undefined}
                            onSubmitRecording={isActive ? handleSubmitRecording : undefined}
                            onCancelRecording={isActive ? handleCancelRecording : undefined}
                            voiceAvailable={isActive && voiceAvailable}
                            ttsAvailable={isActive && ttsAvailable}
                            ttsEnabled={ttsEnabled}
                            ttsMode={ttsMode}
                            onToggleTts={isActive ? handleToggleTts : undefined}
                            onTtsModeChange={isActive ? handleTtsModeChange : undefined}
                          />
                        </div>
                      )
@ -3914,6 +4043,18 @@ function App() {
                onToolOpenChangeForTab={setToolOpenForTab}
                onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
                onActivate={() => setActiveShortcutPane('right')}
                isRecording={isRecording}
                recordingText={voice.interimText}
                recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
                onStartRecording={handleStartRecording}
                onSubmitRecording={handleSubmitRecording}
                onCancelRecording={handleCancelRecording}
                voiceAvailable={voiceAvailable}
                ttsAvailable={ttsAvailable}
                ttsEnabled={ttsEnabled}
                ttsMode={ttsMode}
                onToggleTts={handleToggleTts}
                onTtsModeChange={handleTtsModeChange}
              />
            )}
            {/* Rendered last so its no-drag region paints over the sidebar drag region */}
--- a/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
+++ b/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
@ -1,4 +1,5 @@
 import { useCallback, useEffect, useRef, useState } from 'react'
 import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
 import {
  ArrowUp,
  AudioLines,
@ -9,7 +10,9 @@ import {
  FileSpreadsheet,
  FileText,
  FileVideo,
  Headphones,
  LoaderIcon,
  Mic,
  Plus,
  Square,
  X,
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
  runId?: string | null
  initialDraft?: string
  onDraftChange?: (text: string) => void
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
  voiceAvailable?: boolean
  ttsAvailable?: boolean
  ttsEnabled?: boolean
  ttsMode?: 'summary' | 'full'
  onToggleTts?: () => void
  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }
 function ChatInputInner({
@ -115,6 +130,18 @@ function ChatInputInner({
  runId,
  initialDraft,
  onDraftChange,
  isRecording,
  recordingText,
  recordingState,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
  voiceAvailable,
  ttsAvailable,
  ttsEnabled,
  ttsMode,
  onToggleTts,
  onTtsModeChange,
 }: ChatInputInnerProps) {
  const controller = usePromptInputController()
  const message = controller.textInput.value
@ -367,6 +394,40 @@ function ChatInputInner({
          e.target.value = ''
        }}
      />
      {isRecording ? (
        /* ── Recording bar ── */
        <div className="flex items-center gap-3 px-4 py-3">
          <button
            type="button"
            onClick={onCancelRecording}
            className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
            aria-label="Cancel recording"
          >
            <X className="h-4 w-4" />
          </button>
          <div className="flex flex-1 items-center gap-2 overflow-hidden">
            <VoiceWaveform />
            <span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
              {recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
            </span>
          </div>
          <Button
            size="icon"
            onClick={onSubmitRecording}
            disabled={!recordingText?.trim()}
            className={cn(
              'h-7 w-7 shrink-0 rounded-full transition-all',
              recordingText?.trim()
                ? 'bg-primary text-primary-foreground hover:bg-primary/90'
                : 'bg-muted text-muted-foreground'
            )}
          >
            <ArrowUp className="h-4 w-4" />
          </Button>
        </div>
      ) : (
        /* ── Normal input ── */
        <>
      <div className="px-4 pt-4 pb-2">
        <PromptInputTextarea
          placeholder="Type your message..."
@ -414,6 +475,63 @@ function ChatInputInner({
            </DropdownMenuContent>
          </DropdownMenu>
        )}
        {onToggleTts && ttsAvailable && (
          <div className="flex shrink-0 items-center">
            <Tooltip>
              <TooltipTrigger asChild>
                <button
                  type="button"
                  onClick={onToggleTts}
                  className={cn(
                    'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
                    ttsEnabled
                      ? 'text-foreground hover:bg-muted'
                      : 'text-muted-foreground hover:bg-muted hover:text-foreground'
                  )}
                  aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
                >
                  <Headphones className="h-4 w-4" />
                  {!ttsEnabled && (
                    <span className="absolute inset-0 flex items-center justify-center pointer-events-none">
                      <span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
                    </span>
                  )}
                </button>
              </TooltipTrigger>
              <TooltipContent side="top">
                {ttsEnabled ? 'Voice output on' : 'Voice output off'}
              </TooltipContent>
            </Tooltip>
            {ttsEnabled && onTtsModeChange && (
              <DropdownMenu>
                <DropdownMenuTrigger asChild>
                  <button
                    type="button"
                    className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
                  >
                    <ChevronDown className="h-3 w-3" />
                  </button>
                </DropdownMenuTrigger>
                <DropdownMenuContent align="end">
                  <DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
                    <DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
                    <DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
                  </DropdownMenuRadioGroup>
                </DropdownMenuContent>
              </DropdownMenu>
            )}
          </div>
        )}
        {voiceAvailable && onStartRecording && (
          <button
            type="button"
            onClick={onStartRecording}
            className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
            aria-label="Voice input"
          >
            <Mic className="h-4 w-4" />
          </button>
        )}
        {isProcessing ? (
          <Button
            size="icon"
@ -448,6 +566,31 @@ function ChatInputInner({
          </Button>
        )}
      </div>
        </>
      )}
    </div>
  )
 }
 /**
  * Recording indicator: five bars animated with the `voice-wave` keyframes,
  * each bar's animation delayed by 0.15s more than the previous one.
  */
 function VoiceWaveform() {
  // One <span> per bar; the delay is derived from the bar's position.
  const bars = [0, 1, 2, 3, 4].map((i) => {
    const animation = `voice-wave 1.2s ease-in-out ${i * 0.15}s infinite`
    return (
      <span key={i} className="w-[3px] rounded-full bg-primary" style={{ animation }} />
    )
  })
  // Keyframes are emitted inline next to the bars that use them.
  const keyframesCss = `
        @keyframes voice-wave {
          0%, 100% { height: 4px; }
          50% { height: 16px; }
        }
      `
  return (
    <div className="flex items-center gap-[3px] h-5">
      {bars}
      <style>{keyframesCss}</style>
    </div>
  )
 }
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
  runId?: string | null
  initialDraft?: string
  onDraftChange?: (text: string) => void
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
  voiceAvailable?: boolean
  ttsAvailable?: boolean
  ttsEnabled?: boolean
  ttsMode?: 'summary' | 'full'
  onToggleTts?: () => void
  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }
 export function ChatInputWithMentions({
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
  runId,
  initialDraft,
  onDraftChange,
  isRecording,
  recordingText,
  recordingState,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
  voiceAvailable,
  ttsAvailable,
  ttsEnabled,
  ttsMode,
  onToggleTts,
  onTtsModeChange,
 }: ChatInputWithMentionsProps) {
  return (
    <PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
        runId={runId}
        initialDraft={initialDraft}
        onDraftChange={onDraftChange}
        isRecording={isRecording}
        recordingText={recordingText}
        recordingState={recordingState}
        onStartRecording={onStartRecording}
        onSubmitRecording={onSubmitRecording}
        onCancelRecording={onCancelRecording}
        voiceAvailable={voiceAvailable}
        ttsAvailable={ttsAvailable}
        ttsEnabled={ttsEnabled}
        ttsMode={ttsMode}
        onToggleTts={onToggleTts}
        onTtsModeChange={onTtsModeChange}
      />
    </PromptInputProvider>
  )
--- a/apps/x/apps/renderer/src/components/chat-sidebar.tsx
+++ b/apps/x/apps/renderer/src/components/chat-sidebar.tsx
@ -108,6 +108,19 @@ interface ChatSidebarProps {
  onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
  onOpenKnowledgeFile?: (path: string) => void
  onActivate?: () => void
  // Voice / TTS props
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
  voiceAvailable?: boolean
  ttsAvailable?: boolean
  ttsEnabled?: boolean
  ttsMode?: 'summary' | 'full'
  onToggleTts?: () => void
  onTtsModeChange?: (mode: 'summary' | 'full') => void
 }
 export function ChatSidebar({
@ -146,6 +159,18 @@ export function ChatSidebar({
  onToolOpenChangeForTab,
  onOpenKnowledgeFile,
  onActivate,
  isRecording,
  recordingText,
  recordingState,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
  voiceAvailable,
  ttsAvailable,
  ttsEnabled,
  ttsMode,
  onToggleTts,
  onTtsModeChange,
 }: ChatSidebarProps) {
  const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
  const [isResizing, setIsResizing] = useState(false)
@ -542,6 +567,18 @@ export function ChatSidebar({
                          runId={tabState.runId}
                          initialDraft={getInitialDraft?.(tab.id)}
                          onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
                          isRecording={isActive && isRecording}
                          recordingText={isActive ? recordingText : undefined}
                          recordingState={isActive ? recordingState : undefined}
                          onStartRecording={isActive ? onStartRecording : undefined}
                          onSubmitRecording={isActive ? onSubmitRecording : undefined}
                          onCancelRecording={isActive ? onCancelRecording : undefined}
                          voiceAvailable={isActive && voiceAvailable}
                          ttsAvailable={isActive && ttsAvailable}
                          ttsEnabled={ttsEnabled}
                          ttsMode={ttsMode}
                          onToggleTts={isActive ? onToggleTts : undefined}
                          onTtsModeChange={isActive ? onTtsModeChange : undefined}
                        />
                      </div>
                    )
--- a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
@ -0,0 +1,218 @@
 import { useCallback, useEffect, useRef, useState } from 'react';
 export type VoiceState = 'idle' | 'connecting' | 'listening';
 // Cache the API key so we skip the IPC call after first use
 let cachedApiKey: string | null = null;
 let apiKeyFetched = false;
 /**
  * React hook for voice input: streams microphone audio to a Deepgram
  * WebSocket and accumulates the returned transcript.
  *
  * The WebSocket is opened on mount (once an API key is available) and is
  * reconnected 3 seconds after it closes; only microphone capture is started
  * and stopped per recording.
  * NOTE(review): the server may close idle sockets; confirm whether a
  * keep-alive message is needed to avoid reconnect churn between recordings.
  *
  * @returns
  *  - `state`: 'idle' | 'connecting' | 'listening'
  *  - `interimText`: finalized transcript plus the latest interim result
  *  - `start()`: begin a recording (async; no-op if one is already active)
  *  - `submit()`: stop recording and return the trimmed transcript
  *  - `cancel()`: stop recording and discard the transcript
  */
 export function useVoiceMode() {
    const [state, setState] = useState<VoiceState>('idle');
    const [interimText, setInterimText] = useState('');
    const wsRef = useRef<WebSocket | null>(null);
    const mediaStreamRef = useRef<MediaStream | null>(null);
    const processorRef = useRef<ScriptProcessorNode | null>(null);
    const audioCtxRef = useRef<AudioContext | null>(null);
    const transcriptBufferRef = useRef('');
    const interimRef = useRef('');
    const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
    const mountedRef = useRef(true);
    // Bumped on every start() and every stop, so an in-flight start() can tell
    // after each await whether it has been cancelled or superseded.
    const sessionRef = useRef(0);
    // True from the moment start() commits to a recording until capture stops.
    // Unlike `state`, it updates synchronously, so it also blocks re-entrant starts.
    const recordingRef = useRef(false);

    // Connect (or reconnect) the Deepgram WebSocket.
    // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
    const connectWs = useCallback(() => {
        if (!cachedApiKey) return;
        if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;

        const ws = new WebSocket(
            `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`,
            ['token', cachedApiKey]
        );
        wsRef.current = ws;

        ws.onopen = () => {
            console.log('[voice] WebSocket connected');
        };

        ws.onmessage = (event) => {
            // Results arriving after the recording ended belong to a finished
            // session; dropping them keeps them out of the next transcript.
            if (!recordingRef.current) return;

            let data: {
                is_final?: boolean;
                channel?: { alternatives?: Array<{ transcript?: string }> };
            } | null;
            try {
                data = JSON.parse(event.data);
            } catch (err) {
                console.error('[voice] Ignoring unparseable WebSocket message:', err);
                return;
            }

            const transcript = data?.channel?.alternatives?.[0]?.transcript;
            if (!transcript) return;

            if (data?.is_final) {
                // Finalized segment: append to the buffer and drop the interim text.
                transcriptBufferRef.current += (transcriptBufferRef.current ? ' ' : '') + transcript;
                interimRef.current = '';
                setInterimText(transcriptBufferRef.current);
            } else {
                // Interim segment: show it after the finalized text without committing it.
                interimRef.current = transcript;
                setInterimText(transcriptBufferRef.current + (transcriptBufferRef.current ? ' ' : '') + transcript);
            }
        };

        ws.onerror = () => {
            console.error('[voice] WebSocket error');
        };

        ws.onclose = () => {
            console.log('[voice] WebSocket closed');
            wsRef.current = null;
            // Auto-reconnect after 3 seconds if still mounted
            if (mountedRef.current && cachedApiKey) {
                reconnectTimerRef.current = setTimeout(() => {
                    if (mountedRef.current) connectWs();
                }, 3000);
            }
        };
    }, []);

    // Release the microphone, audio graph, and processor without touching React state.
    const releaseAudio = useCallback(() => {
        if (processorRef.current) {
            processorRef.current.onaudioprocess = null;
            processorRef.current.disconnect();
            processorRef.current = null;
        }
        if (audioCtxRef.current) {
            // close() is async; a rejection (e.g. already closed) is not actionable here.
            void audioCtxRef.current.close().catch(() => undefined);
            audioCtxRef.current = null;
        }
        if (mediaStreamRef.current) {
            mediaStreamRef.current.getTracks().forEach(t => t.stop());
            mediaStreamRef.current = null;
        }
    }, []);

    // Fetch API key on mount and establish persistent WebSocket
    useEffect(() => {
        mountedRef.current = true;

        const init = async () => {
            if (!apiKeyFetched) {
                apiKeyFetched = true;
                try {
                    const config = await window.ipc.invoke('voice:getConfig', null);
                    cachedApiKey = config.deepgram?.apiKey ?? null;
                } catch (err) {
                    // Best effort: start() retries the fetch when no key is cached.
                    console.error('[voice] Failed to load voice config:', err);
                }
            }
            if (cachedApiKey && mountedRef.current) {
                connectWs();
            }
        };
        void init();

        return () => {
            mountedRef.current = false;
            if (reconnectTimerRef.current) {
                clearTimeout(reconnectTimerRef.current);
                reconnectTimerRef.current = null;
            }
            // Invalidate any in-flight start() and make sure the mic is not left open.
            sessionRef.current += 1;
            recordingRef.current = false;
            releaseAudio();
            // Close WS on unmount, suppress reconnect by nulling onclose
            if (wsRef.current) {
                wsRef.current.onclose = null;
                wsRef.current.close();
                wsRef.current = null;
            }
        };
    }, [connectWs, releaseAudio]);

    // Stop only audio capture (mic + processor), leaving WS open
    const stopAudioCapture = useCallback(() => {
        sessionRef.current += 1;
        recordingRef.current = false;
        releaseAudio();
        setInterimText('');
        transcriptBufferRef.current = '';
        interimRef.current = '';
        setState('idle');
    }, [releaseAudio]);

    /** Begin a recording: ensure key + socket, then open the mic and stream PCM audio. */
    const start = useCallback(async () => {
        if (state !== 'idle' || recordingRef.current) return;
        recordingRef.current = true;
        const session = ++sessionRef.current;
        const superseded = () => sessionRef.current !== session;
        // Abandon this attempt. If something newer took over, it owns the shared state.
        const fail = () => {
            if (superseded()) return;
            recordingRef.current = false;
            setState('idle');
        };

        // Ensure we have an API key
        if (!cachedApiKey) {
            try {
                const config = await window.ipc.invoke('voice:getConfig', null);
                cachedApiKey = config.deepgram?.apiKey ?? null;
            } catch (err) {
                console.error('[voice] Failed to load voice config:', err);
            }
            if (superseded()) return;
        }
        if (!cachedApiKey) {
            console.error('Deepgram not configured');
            fail();
            return;
        }

        transcriptBufferRef.current = '';
        interimRef.current = '';
        setInterimText('');

        // If WS isn't connected, connect and wait for it
        if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
            setState('connecting');
            connectWs();
            // Wait for WS to be ready (up to 5 seconds)
            const wsOk = await new Promise<boolean>((resolve) => {
                const poll = setInterval(() => {
                    if (superseded()) {
                        clearInterval(poll);
                        clearTimeout(deadline);
                        resolve(false);
                    } else if (wsRef.current?.readyState === WebSocket.OPEN) {
                        clearInterval(poll);
                        clearTimeout(deadline);
                        resolve(true);
                    }
                }, 50);
                const deadline = setTimeout(() => {
                    clearInterval(poll);
                    resolve(false);
                }, 5000);
            });
            if (superseded()) return;
            if (!wsOk) {
                fail();
                return;
            }
        }

        setState('listening');

        // Start mic
        let stream: MediaStream;
        try {
            stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        } catch (err) {
            console.error('Microphone access denied:', err);
            fail();
            return;
        }
        if (superseded()) {
            // Cancelled while the device/permission request was pending: the stream
            // was never stored in a ref, so release it here or the mic stays open.
            stream.getTracks().forEach(t => t.stop());
            return;
        }
        mediaStreamRef.current = stream;

        // Start audio capture
        const audioCtx = new AudioContext({ sampleRate: 16000 });
        audioCtxRef.current = audioCtx;
        const source = audioCtx.createMediaStreamSource(stream);
        const processor = audioCtx.createScriptProcessor(4096, 1, 1);
        processorRef.current = processor;

        processor.onaudioprocess = (e) => {
            if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
            // Convert float samples in [-1, 1] to 16-bit signed PCM (linear16).
            const float32 = e.inputBuffer.getChannelData(0);
            const int16 = new Int16Array(float32.length);
            for (let i = 0; i < float32.length; i++) {
                const s = Math.max(-1, Math.min(1, float32[i]));
                int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
            }
            wsRef.current.send(int16.buffer);
        };

        source.connect(processor);
        processor.connect(audioCtx.destination);
    }, [state, connectWs]);

    /** Stop recording and return the full transcript (finalized + any current interim) */
    const submit = useCallback((): string => {
        let text = transcriptBufferRef.current;
        if (interimRef.current) {
            text += (text ? ' ' : '') + interimRef.current;
        }
        text = text.trim();
        stopAudioCapture();
        return text;
    }, [stopAudioCapture]);

    /** Cancel recording without returning transcript */
    const cancel = useCallback(() => {
        stopAudioCapture();
    }, [stopAudioCapture]);

    return { state, interimText, start, submit, cancel };
 }
--- a/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
@ -0,0 +1,72 @@
 import { useCallback, useRef, useState } from 'react';
 export type TTSState = 'idle' | 'synthesizing' | 'speaking';
 /**
  * React hook that speaks text aloud through the `voice:synthesize` IPC channel.
  *
  * Text passed to `speak` is queued and played one clip at a time, in order.
  * `cancel` drops the queue, stops the clip that is playing, and guarantees
  * that a clip still being synthesized is never played afterwards.
  *
  * @returns `state` ('idle' | 'synthesizing' | 'speaking'), `speak(text)` and `cancel()`.
  */
 export function useVoiceTTS() {
    const [state, setState] = useState<TTSState>('idle');
    const audioRef = useRef<HTMLAudioElement | null>(null);
    const queueRef = useRef<string[]>([]);
    const processingRef = useRef(false);
    // Bumped by cancel(). A drain loop started under an older generation must
    // stop without touching shared refs or state.
    const generationRef = useRef(0);
    // Resolver of the in-flight playback promise. pause() does not fire
    // `ended`, so cancel() calls this to unblock the drain loop.
    const stopPlaybackRef = useRef<(() => void) | null>(null);

    const processQueue = useCallback(async () => {
        if (processingRef.current) return;
        processingRef.current = true;
        const generation = generationRef.current;
        const isCurrent = () => generationRef.current === generation;

        while (isCurrent() && queueRef.current.length > 0) {
            const text = queueRef.current.shift()!;
            if (!text.trim()) continue;
            setState('synthesizing');
            console.log('[tts] synthesizing:', text.substring(0, 80));
            try {
                const result = await window.ipc.invoke('voice:synthesize', { text });
                // cancel() may have run while synthesis was in flight: discard the clip.
                if (!isCurrent()) break;
                console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
                setState('speaking');
                await new Promise<void>((resolve, reject) => {
                    const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
                    const audio = new Audio(dataUrl);
                    audioRef.current = audio;
                    // Calling a resolver after the promise settled is a no-op,
                    // so a stale value here is harmless.
                    stopPlaybackRef.current = resolve;
                    audio.onended = () => {
                        console.log('[tts] audio ended');
                        resolve();
                    };
                    audio.onerror = (e) => {
                        console.error('[tts] audio error:', e);
                        reject(new Error('Audio playback failed'));
                    };
                    audio.play().then(() => {
                        console.log('[tts] audio playing');
                    }).catch((err) => {
                        console.error('[tts] play() rejected:', err);
                        reject(err);
                    });
                });
            } catch (err) {
                // One failed clip must not stop the rest of the queue.
                console.error('[tts] error:', err);
            }
        }

        // A cancelled loop leaves everything alone: cancel() already reset the
        // state, and a newer loop may now own the refs.
        if (!isCurrent()) return;
        audioRef.current = null;
        stopPlaybackRef.current = null;
        processingRef.current = false;
        setState('idle');
    }, []);

    const speak = useCallback((text: string) => {
        console.log('[tts] speak() called:', text.substring(0, 80));
        queueRef.current.push(text);
        // processQueue handles its own errors; `void` marks the fire-and-forget.
        void processQueue();
    }, [processQueue]);

    const cancel = useCallback(() => {
        // Invalidate the running drain loop before touching anything else.
        generationRef.current += 1;
        queueRef.current = [];
        if (audioRef.current) {
            audioRef.current.pause();
            audioRef.current = null;
        }
        // Settle the pending playback promise so the old loop can exit.
        stopPlaybackRef.current?.();
        stopPlaybackRef.current = null;
        processingRef.current = false;
        setState('idle');
    }, []);

    return { state, speak, cancel };
 }
--- a/apps/x/packages/core/src/agents/runtime.ts
+++ b/apps/x/packages/core/src/agents/runtime.ts
@ -894,11 +894,19 @@ export async function* streamAgent({
        }
        // get any queued user messages
        let voiceInput = false;
        let voiceOutput: 'summary' | 'full' | null = null;
        while (true) {
            const msg = await messageQueue.dequeue(runId);
            if (!msg) {
                break;
            }
            if (msg.voiceInput) {
                voiceInput = true;
            }
            if (msg.voiceOutput) {
                voiceOutput = msg.voiceOutput;
            }
            loopLogger.log('dequeued user message', msg.messageId);
            yield* processEvent({
                runId,
@ -938,7 +946,18 @@ export async function* streamAgent({
            minute: '2-digit',
            timeZoneName: 'short'
        });
-        const instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
+        let instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
        if (voiceInput) {
            loopLogger.log('voice input enabled, injecting voice input prompt');
            instructionsWithDateTime += `\n\n# Voice Input\nThe user's message was transcribed from speech. Be aware that:\n- There may be transcription errors. Silently correct obvious ones (e.g. homophones, misheard words). If an error is genuinely ambiguous, briefly mention your interpretation (e.g. "I'm assuming you meant X").\n- Spoken messages are often long-winded. The user may ramble, repeat themselves, or correct something they said earlier in the same message. Focus on their final intent, not every word verbatim.`;
        }
        if (voiceOutput === 'summary') {
            loopLogger.log('voice output enabled (summary mode), injecting voice output prompt');
            instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with <voice></voice> tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with <voice> tags.\n\nRules:\n1. ALWAYS start your response with one or more <voice> tags. Never skip them.\n2. Place ALL <voice> tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse <voice> tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate <voice> tag so it can be spoken incrementally. Do NOT wrap everything in a single <voice> block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\n<voice>Your meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.</voice>\n<voice>I've pulled out the key details and action items below — the demo prep notes are at the end.</voice>\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more <voice> tags.)\n\nAny text outside <voice> tags is shown visually but not spoken.`;
        } else if (voiceOutput === 'full') {
            loopLogger.log('voice output enabled (full mode), injecting voice output prompt');
            instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in <voice></voice> tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate <voice> tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a <voice> tag. Do not leave any content outside <voice> tags.\n\nExample:\n<voice>Your meeting with Sarah covered three main things.</voice>\n<voice>First, you discussed the Q2 roadmap timeline and agreed to push the launch to April.</voice>\n<voice>Second, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.</voice>\n<voice>And lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.</voice>`;
        }
        let streamError: string | null = null;
        for await (const event of streamLlm(
            model,
--- a/apps/x/packages/core/src/application/assistant/instructions.ts
+++ b/apps/x/packages/core/src/application/assistant/instructions.ts
@ -33,6 +33,8 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects,
 **Document Collaboration:** When users ask you to work on a document, collaborate on writing, create a new document, edit/refine existing notes, or say things like "let's work on [X]", "help me write [X]", "create a doc for [X]", or "let's draft [X]", you MUST load the \`doc-collab\` skill first. This is required for any document creation or editing task. The skill provides structured guidance for creating, editing, and refining documents in the knowledge base.
 **App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view.
 **Slack:** When users ask about Slack messages, want to send messages to teammates, check channel conversations, or find someone on Slack, load the \`slack\` skill. You can send messages, view channel history, search conversations, and find users. Always show message drafts to the user before sending.
 ## Memory That Compounds
@ -184,6 +186,7 @@ ${runtimeContextPrompt}
 - \`loadSkill\` - Skill loading
 - \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them.
 - \`web-search\` and \`research-search\` - Web and research search tools (available when configured). **You MUST load the \`web-search\` skill before using either of these tools.** It tells you which tool to pick and how many searches to do.
 - \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.**
 **Prefer these tools whenever possible** — they work instantly with zero friction. For file operations inside \`~/.rowboat/\`, always use these instead of \`executeCommand\`.
--- a/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts
+++ b/apps/x/packages/core/src/application/assistant/skills/app-navigation/skill.ts
@ -44,6 +44,7 @@ Change filters, columns, sort order, or search in the bases (table) view.
 - If unsure what categories/values are available, call ` + "`get-base-state`" + ` first.
 - For "show me X", prefer ` + "`filters.set`" + ` to start fresh rather than ` + "`filters.add`" + `.
 - Categories come from frontmatter keys (e.g., relationship, status, topic, type).
 - **CRITICAL: Do NOT pass ` + "`columns`" + ` unless the user explicitly asks to show/hide specific columns.** Omit the ` + "`columns`" + ` parameter entirely when only filtering, sorting, or searching. Passing ` + "`columns`" + ` will override the user's current column layout and can make the view appear empty.
 ### get-base-state
 Retrieve information about what's in the knowledge base — available filter categories, values, and note count.
@ -75,6 +76,7 @@ Save the current view configuration as a named base.
 - The ` + "`update-base-view`" + ` action will automatically navigate to the bases view if the user isn't already there.
 - ` + "`open-note`" + ` validates that the file exists before navigating.
 - Filter categories and values come from frontmatter in knowledge files.
 - **Never send ` + "`columns`" + ` or ` + "`sort`" + ` with ` + "`update-base-view`" + ` unless the user specifically asks to change them.** Only pass the parameters you intend to change — omitted parameters are left untouched.
 `;
 export default skill;
--- a/apps/x/packages/core/src/application/lib/builtin-tools.ts
+++ b/apps/x/packages/core/src/application/lib/builtin-tools.ts
@ -884,6 +884,145 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
        },
    },
    // ============================================================================
    // App Navigation
    // ============================================================================
    'app-navigation': {
        description: 'Control the app UI - navigate to notes, switch views, filter/search the knowledge base, and manage saved views.',
        inputSchema: z.object({
            action: z.enum(["open-note", "open-view", "update-base-view", "get-base-state", "create-base"]).describe("The navigation action to perform"),
            // open-note
            path: z.string().optional().describe("Knowledge file path for open-note, e.g. knowledge/People/John.md"),
            // open-view
            view: z.enum(["bases", "graph"]).optional().describe("Which view to open (for open-view action)"),
            // update-base-view
            filters: z.object({
                set: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Replace all filters with these"),
                add: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Add these filters"),
                remove: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Remove these filters"),
                clear: z.boolean().optional().describe("Clear all filters"),
            }).optional().describe("Filter modifications (for update-base-view)"),
            columns: z.object({
                set: z.array(z.string()).optional().describe("Replace visible columns with these"),
                add: z.array(z.string()).optional().describe("Add these columns"),
                remove: z.array(z.string()).optional().describe("Remove these columns"),
            }).optional().describe("Column modifications (for update-base-view)"),
            sort: z.object({
                field: z.string(),
                dir: z.enum(["asc", "desc"]),
            }).optional().describe("Sort configuration (for update-base-view)"),
            search: z.string().optional().describe("Search query to filter notes (for update-base-view)"),
            // get-base-state
            base_name: z.string().optional().describe("Name of a saved base to inspect (for get-base-state). Omit for the current/default view."),
            // create-base
            name: z.string().optional().describe("Name for the saved base view (for create-base)"),
        }),
        /**
         * Run one navigation action and return a plain result object.
         *
         * Every outcome, including invalid input, is reported as
         * `{ success: false, error }` instead of throwing. The per-action
         * parameters are optional in the schema, so each action checks the
         * ones it needs before using them.
         */
        execute: async (input: {
            action: string;
            [key: string]: unknown;
        }) => {
            switch (input.action) {
                case 'open-note': {
                    const filePath = input.path;
                    // `path` is optional in the schema; reject it here rather than
                    // handing `undefined` to workspace.exists.
                    if (typeof filePath !== 'string' || !filePath.trim()) {
                        return { success: false, error: 'Missing required parameter "path" for open-note' };
                    }
                    try {
                        const result = await workspace.exists(filePath);
                        if (!result.exists) {
                            return { success: false, error: `File not found: ${filePath}` };
                        }
                        return { success: true, action: 'open-note', path: filePath };
                    } catch {
                        return { success: false, error: `Could not access file: ${filePath}` };
                    }
                }
                case 'open-view': {
                    const view = input.view;
                    if (typeof view !== 'string' || !view) {
                        return { success: false, error: 'Missing required parameter "view" for open-view' };
                    }
                    return { success: true, action: 'open-view', view };
                }
                case 'update-base-view': {
                    // Forward only the parameters the caller supplied.
                    const updates: Record<string, unknown> = {};
                    if (input.filters) updates.filters = input.filters;
                    if (input.columns) updates.columns = input.columns;
                    if (input.sort) updates.sort = input.sort;
                    if (input.search !== undefined) updates.search = input.search;
                    return { success: true, action: 'update-base-view', updates };
                }
                case 'get-base-state': {
                    // Scan knowledge/ files and extract frontmatter properties.
                    // NOTE(review): `base_name` is accepted by the schema but not read here.
                    try {
                        const { parseFrontmatter } = await import("@x/shared/dist/frontmatter.js");
                        const entries = await workspace.readdir("knowledge", { recursive: true, allowedExtensions: [".md"] });
                        const files = entries.filter(e => e.kind === 'file');
                        const properties = new Map<string, Set<string>>();
                        let noteCount = 0;
                        for (const file of files) {
                            try {
                                const { data } = await workspace.readFile(file.path);
                                const { fields } = parseFrontmatter(data);
                                noteCount++;
                                for (const [key, value] of Object.entries(fields)) {
                                    if (!value) continue;
                                    let set = properties.get(key);
                                    if (!set) { set = new Set(); properties.set(key, set); }
                                    const values = Array.isArray(value) ? value : [value];
                                    for (const v of values) {
                                        const trimmed = v.trim();
                                        if (trimmed) set.add(trimmed);
                                    }
                                }
                            } catch {
                                // skip unreadable files
                            }
                        }
                        const availableProperties: Record<string, string[]> = {};
                        for (const [key, values] of properties) {
                            availableProperties[key] = [...values].sort();
                        }
                        return {
                            success: true,
                            action: 'get-base-state',
                            noteCount,
                            availableProperties,
                        };
                    } catch (error) {
                        return {
                            success: false,
                            error: error instanceof Error ? error.message : 'Failed to read knowledge base',
                        };
                    }
                }
                case 'create-base': {
                    const name = input.name;
                    // `name` is optional in the schema; calling .replace on a
                    // missing value would throw instead of returning an error.
                    if (typeof name !== 'string') {
                        return { success: false, error: 'Missing required parameter "name" for create-base' };
                    }
                    // Keep only letters, digits, underscore, hyphen and space so the
                    // name is safe to use as a file name.
                    const safeName = name.replace(/[^a-zA-Z0-9_\- ]/g, '').trim();
                    if (!safeName) {
                        return { success: false, error: 'Invalid base name' };
                    }
                    const basePath = `bases/${safeName}.base`;
                    try {
                        const config = { name: safeName, filters: [], columns: [] };
                        await workspace.writeFile(basePath, JSON.stringify(config, null, 2), { mkdirp: true });
                        return { success: true, action: 'create-base', name: safeName, path: basePath };
                    } catch (error) {
                        return {
                            success: false,
                            error: error instanceof Error ? error.message : 'Failed to create base',
                        };
                    }
                }
                default:
                    return { success: false, error: `Unknown action: ${input.action}` };
            }
        },
    },
    // ============================================================================
    // Web Search (Brave Search API)
    // ============================================================================
--- a/apps/x/packages/core/src/application/lib/message-queue.ts
+++ b/apps/x/packages/core/src/application/lib/message-queue.ts
@ -3,14 +3,17 @@ import { UserMessageContent } from "@x/shared/dist/message.js";
 import z from "zod";
 export type UserMessageContentType = z.infer<typeof UserMessageContent>;
 /** How much of a response should be spoken; carried alongside a queued message as its `voiceOutput` value. */
 export type VoiceOutputMode = 'summary' | 'full';
 /** One queued user message together with the voice options it was submitted with. */
 type EnqueuedMessage = {
    /** Identifier assigned when the message was enqueued. */
    messageId: string;
    /** The user message content itself. */
    message: UserMessageContentType;
    /** Optional voice-input flag, stored exactly as passed to enqueue. */
    voiceInput?: boolean;
    /** Optional voice-output mode, stored exactly as passed to enqueue. */
    voiceOutput?: VoiceOutputMode;
 };
 export interface IMessageQueue {
-    enqueue(runId: string, message: UserMessageContentType): Promise<string>;
+    enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string>;
    dequeue(runId: string): Promise<EnqueuedMessage | null>;
 }
@ -26,7 +29,7 @@ export class InMemoryMessageQueue implements IMessageQueue {
        this.idGenerator = idGenerator;
    }
-    async enqueue(runId: string, message: UserMessageContentType): Promise<string> {
+    async enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
        if (!this.store[runId]) {
            this.store[runId] = [];
        }
@ -34,6 +37,8 @@ export class InMemoryMessageQueue implements IMessageQueue {
        this.store[runId].push({
            messageId: id,
            message,
            voiceInput,
            voiceOutput,
        });
        return id;
    }
@ -44,4 +49,4 @@ export class InMemoryMessageQueue implements IMessageQueue {
        }
        return this.store[runId].shift() ?? null;
    }
-}
+}
--- a/apps/x/packages/core/src/index.ts
+++ b/apps/x/packages/core/src/index.ts
@ -9,3 +9,6 @@ export { initConfigs } from './config/initConfigs.js';
 // Knowledge version history
 export * as versionHistory from './knowledge/version_history.js';
 // Voice mode (config + TTS)
 export * as voice from './voice/voice.js';
--- a/apps/x/packages/core/src/models/models.ts
+++ b/apps/x/packages/core/src/models/models.ts
@ -64,7 +64,7 @@ export function createProvider(config: z.infer<typeof Provider>): ProviderV2 {
                apiKey,
                baseURL,
                headers,
-            });
+            }) as unknown as ProviderV2;
        default:
            throw new Error(`Unsupported provider flavor: ${config.flavor}`);
    }
--- a/apps/x/packages/core/src/runs/runs.ts
+++ b/apps/x/packages/core/src/runs/runs.ts
@ -1,6 +1,6 @@
 import z from "zod";
 import container from "../di/container.js";
-import { IMessageQueue, UserMessageContentType } from "../application/lib/message-queue.js";
+import { IMessageQueue, UserMessageContentType, VoiceOutputMode } from "../application/lib/message-queue.js";
 import { AskHumanResponseEvent, ToolPermissionRequestEvent, ToolPermissionResponseEvent, CreateRunOptions, Run, ListRunsResponse, ToolPermissionAuthorizePayload, AskHumanResponsePayload } from "@x/shared/dist/runs.js";
 import { IRunsRepo } from "./repo.js";
 import { IAgentRuntime } from "../agents/runtime.js";
@ -19,9 +19,9 @@ export async function createRun(opts: z.infer<typeof CreateRunOptions>): Promise
    return run;
 }
-export async function createMessage(runId: string, message: UserMessageContentType): Promise<string> {
+export async function createMessage(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
    const queue = container.resolve<IMessageQueue>('messageQueue');
-    const id = await queue.enqueue(runId, message);
+    const id = await queue.enqueue(runId, message, voiceInput, voiceOutput);
    const runtime = container.resolve<IAgentRuntime>('agentRuntime');
    runtime.trigger(runId);
    return id;
--- a/apps/x/packages/core/src/voice/voice.ts
+++ b/apps/x/packages/core/src/voice/voice.ts
@ -0,0 +1,70 @@
 import * as fs from 'fs/promises';
 import * as path from 'path';
 // Base directory for config lookups: HOME, else USERPROFILE, else ''.
 // NOTE(review): with '' the path joined in readJsonConfig is relative, so it
 // would resolve against the process's working directory — confirm that is acceptable.
 const homedir = process.env.HOME || process.env.USERPROFILE || '';
 /**
  * Voice provider credentials as returned by getVoiceConfig.
  * A provider is null when its config file is unreadable or has no apiKey.
  */
 export interface VoiceConfig {
    /** Deepgram credentials, or null when not configured. */
    deepgram: { apiKey: string } | null;
    /** ElevenLabs credentials, or null when not configured; voiceId is optional. */
    elevenlabs: { apiKey: string; voiceId?: string } | null;
 }
 /**
  * Read and parse `<homedir>/.rowboat/config/<filename>`.
  *
  * @param filename File name inside the config directory, e.g. `deepgram.json`.
  * @returns The parsed JSON object, or null when the file is missing,
  *          unreadable, not valid JSON, or does not contain a JSON object.
  */
 async function readJsonConfig(filename: string): Promise<Record<string, unknown> | null> {
    try {
        const configPath = path.join(homedir, '.rowboat', 'config', filename);
        const raw = await fs.readFile(configPath, 'utf8');
        const parsed: unknown = JSON.parse(raw);
        // JSON.parse can also yield arrays, primitives or null; only a plain
        // object matches the declared return type.
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            return null;
        }
        return parsed as Record<string, unknown>;
    } catch {
        // Best-effort read: every failure means "not configured".
        return null;
    }
 }
 /**
  * Load the voice provider credentials from `deepgram.json` and `elevenlabs.json`.
  *
  * A provider is reported as null unless its config holds a non-empty string
  * `apiKey`. A `voiceId` that is not a string is dropped. Values are narrowed
  * with typeof checks so the result really matches `VoiceConfig`.
  *
  * @returns The resolved configuration; this function does not throw for
  *          missing or malformed config files.
  */
 export async function getVoiceConfig(): Promise<VoiceConfig> {
    // The two files are independent, so read them concurrently.
    const [dgConfig, elConfig] = await Promise.all([
        readJsonConfig('deepgram.json'),
        readJsonConfig('elevenlabs.json'),
    ]);
    const dgKey = dgConfig?.apiKey;
    const elKey = elConfig?.apiKey;
    const elVoiceId = elConfig?.voiceId;
    return {
        deepgram: typeof dgKey === 'string' && dgKey ? { apiKey: dgKey } : null,
        elevenlabs: typeof elKey === 'string' && elKey
            ? { apiKey: elKey, voiceId: typeof elVoiceId === 'string' ? elVoiceId : undefined }
            : null,
    };
 }
 /**
  * Convert text to speech with the ElevenLabs text-to-speech endpoint.
  *
  * @param text The text to speak.
  * @returns The audio as a base64 string plus its MIME type (`audio/mpeg`).
  * @throws Error when ElevenLabs is not configured or the API responds with a
  *         non-OK status.
  */
 export async function synthesizeSpeech(text: string): Promise<{ audioBase64: string; mimeType: string }> {
    const { elevenlabs } = await getVoiceConfig();
    if (!elevenlabs) {
        throw new Error('ElevenLabs not configured. Create ~/.rowboat/config/elevenlabs.json with { "apiKey": "<your-key>" }');
    }

    // Use the configured voice, or the built-in default when none is set.
    const voiceId = elevenlabs.voiceId || 'UgBBYS2sOqTuMpoF3BR0';
    const endpoint = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;
    const requestHeaders = {
        'xi-api-key': elevenlabs.apiKey,
        'Content-Type': 'application/json',
    };
    const requestBody = JSON.stringify({
        text,
        model_id: 'eleven_multilingual_v2',
        voice_settings: {
            stability: 0.5,
            similarity_boost: 0.75,
        },
    });

    console.log('[voice] synthesizing speech, text length:', text.length, 'voiceId:', voiceId);
    const response = await fetch(endpoint, {
        method: 'POST',
        headers: requestHeaders,
        body: requestBody,
    });

    if (!response.ok) {
        // The error body is informational only; fall back if it cannot be read.
        let errText: string;
        try {
            errText = await response.text();
        } catch {
            errText = 'Unknown error';
        }
        console.error('[voice] ElevenLabs API error:', response.status, errText);
        throw new Error(`ElevenLabs API error ${response.status}: ${errText}`);
    }

    const audioBytes = Buffer.from(await response.arrayBuffer());
    const audioBase64 = audioBytes.toString('base64');
    console.log('[voice] synthesized audio, base64 length:', audioBase64.length);
    return { audioBase64, mimeType: 'audio/mpeg' };
 }
--- a/apps/x/packages/shared/src/ipc.ts
+++ b/apps/x/packages/shared/src/ipc.ts
@ -130,6 +130,8 @@ const ipcSchemas = {
    req: z.object({
      runId: z.string(),
      message: UserMessageContent,
      voiceInput: z.boolean().optional(),
      voiceOutput: z.enum(['summary', 'full']).optional(),
    }),
    res: z.object({
      messageId: z.string(),
@ -460,6 +462,23 @@ const ipcSchemas = {
      })),
    }),
  },
  // Voice mode channels
  // Fetch the voice provider configuration. The request carries no payload
  // (null); each provider in the response is null when it is not configured.
  // NOTE(review): the response includes the raw provider API keys — confirm
  // that the caller of this channel is meant to receive them.
  'voice:getConfig': {
    req: z.null(),
    res: z.object({
      deepgram: z.object({ apiKey: z.string() }).nullable(),
      elevenlabs: z.object({ apiKey: z.string(), voiceId: z.string().optional() }).nullable(),
    }),
  },
  // Text-to-speech request: takes the text to speak and returns the audio as
  // a base64 string together with its MIME type.
  'voice:synthesize': {
    req: z.object({
      text: z.string(),
    }),
    res: z.object({
      audioBase64: z.string(),
      mimeType: z.string(),
    }),
  },
  // Inline task schedule classification
  'inline-task:classifySchedule': {
    req: z.object({