voice mode with speech-to-text input and TTS output

This commit is contained in:
Arjun 2026-03-13 10:26:08 +05:30
parent e730c118dc
commit 8db1a091f0
17 changed files with 937 additions and 15 deletions

View file

@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
import { search } from '@x/core/dist/search/search.js';
import { versionHistory } from '@x/core';
import { versionHistory, voice } from '@x/core';
import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';
type InvokeChannels = ipc.InvokeChannels;
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
return runsCore.createRun(args);
},
'runs:createMessage': async (_event, args) => {
return { messageId: await runsCore.createMessage(args.runId, args.message) };
return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
},
'runs:authorizePermission': async (_event, args) => {
await runsCore.authorizePermission(args.runId, args.authorization);
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
const schedule = await classifySchedule(args.instruction);
return { schedule };
},
'voice:getConfig': async () => {
return voice.getVoiceConfig();
},
'voice:synthesize': async (_event, args) => {
return voice.synthesizeSpeech(args.text);
},
});
}

View file

@ -1,4 +1,4 @@
import { app, BrowserWindow, protocol, net, shell } from "electron";
import { app, BrowserWindow, protocol, net, shell, session } from "electron";
import path from "node:path";
import {
setupIpcHandlers,
@ -92,6 +92,15 @@ function createWindow() {
},
});
// Grant microphone permission for voice mode
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
if (permission === 'media') {
callback(true);
} else {
callback(false);
}
});
// Show window when content is ready to prevent blank screen
win.once("ready-to-show", () => {
win.show();

View file

@ -76,6 +76,8 @@ import {
import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
import { toast } from "sonner"
import { useVoiceMode } from '@/hooks/useVoiceMode'
import { useVoiceTTS } from '@/hooks/useVoiceTTS'
type DirEntry = z.infer<typeof workspace.DirEntry>
type RunEventType = z.infer<typeof RunEvent>
@ -546,6 +548,87 @@ function App() {
const [agentId] = useState<string>('copilot')
const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)
// Voice mode state
const [voiceAvailable, setVoiceAvailable] = useState(false)
const [ttsAvailable, setTtsAvailable] = useState(false)
const [ttsEnabled, setTtsEnabled] = useState(false)
const ttsEnabledRef = useRef(false)
const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
const ttsModeRef = useRef<'summary' | 'full'>('summary')
const [isRecording, setIsRecording] = useState(false)
const voiceTextBufferRef = useRef('')
const spokenIndexRef = useRef(0)
const isRecordingRef = useRef(false)
const tts = useVoiceTTS()
const ttsRef = useRef(tts)
ttsRef.current = tts
const voice = useVoiceMode()
const voiceRef = useRef(voice)
voiceRef.current = voice
// Check if voice is available on mount
useEffect(() => {
window.ipc.invoke('voice:getConfig', null).then(config => {
setVoiceAvailable(!!config.deepgram)
setTtsAvailable(!!config.elevenlabs)
}).catch(() => {
setVoiceAvailable(false)
setTtsAvailable(false)
})
}, [])
const handleStartRecording = useCallback(() => {
setIsRecording(true)
isRecordingRef.current = true
voice.start()
}, [voice])
const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
const pendingVoiceInputRef = useRef(false)
const handleSubmitRecording = useCallback(() => {
const text = voice.submit()
setIsRecording(false)
isRecordingRef.current = false
if (text) {
pendingVoiceInputRef.current = true
handlePromptSubmitRef.current?.({ text })
}
}, [voice])
const handleToggleTts = useCallback(() => {
setTtsEnabled(prev => {
const next = !prev
ttsEnabledRef.current = next
if (!next) {
ttsRef.current.cancel()
}
return next
})
}, [])
const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
setTtsMode(mode)
ttsModeRef.current = mode
}, [])
const handleCancelRecording = useCallback(() => {
voice.cancel()
setIsRecording(false)
isRecordingRef.current = false
}, [voice])
// Helper to cancel recording from any navigation handler
const cancelRecordingIfActive = useCallback(() => {
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
}, [])
// Runs history state
type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
const [runs, setRuns] = useState<RunListItem[]>([])
@ -1496,6 +1579,9 @@ function App() {
if (!isActiveRun) return
setIsProcessing(true)
setModelUsage(null)
// Reset voice buffer for new response
voiceTextBufferRef.current = ''
spokenIndexRef.current = 0
break
case 'run-processing-end':
@ -1545,6 +1631,20 @@ function App() {
if (llmEvent.type === 'text-delta' && llmEvent.delta) {
appendStreamingBuffer(event.runId, llmEvent.delta)
setCurrentAssistantMessage(prev => prev + llmEvent.delta)
// Extract <voice> tags and send to TTS when enabled
voiceTextBufferRef.current += llmEvent.delta
const remaining = voiceTextBufferRef.current.substring(spokenIndexRef.current)
const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
let voiceMatch: RegExpExecArray | null
while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
const voiceContent = voiceMatch[1].trim()
console.log('[voice] extracted voice tag:', voiceContent)
if (voiceContent && ttsEnabledRef.current) {
ttsRef.current.speak(voiceContent)
}
spokenIndexRef.current += voiceMatch.index + voiceMatch[0].length
}
} else if (llmEvent.type === 'tool-call') {
setConversation(prev => [...prev, {
id: llmEvent.toolCallId || `tool-${Date.now()}`,
@ -1584,6 +1684,7 @@ function App() {
if (msg.role === 'assistant') {
setCurrentAssistantMessage(currentMsg => {
if (currentMsg) {
const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
setConversation(prev => {
const exists = prev.some(m =>
m.id === event.messageId && 'role' in m && m.role === 'assistant'
@ -1592,7 +1693,7 @@ function App() {
return [...prev, {
id: event.messageId,
role: 'assistant',
content: currentMsg,
content: cleanedContent,
timestamp: Date.now(),
}]
})
@ -1887,6 +1988,8 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: attachmentPayload,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
} else {
// Legacy path: plain string with optional XML-formatted @mentions.
@ -1915,11 +2018,15 @@ function App() {
await window.ipc.invoke('runs:createMessage', {
runId: currentRunId,
message: formattedMessage,
voiceInput: pendingVoiceInputRef.current || undefined,
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
})
titleSource = formattedMessage
}
pendingVoiceInputRef.current = false
if (isNewRun) {
const inferredTitle = inferRunTitleFromMessage(titleSource)
setRuns((prev) => {
@ -1936,6 +2043,7 @@ function App() {
console.error('Failed to send message:', error)
}
}
handlePromptSubmitRef.current = handlePromptSubmit
const handleStop = useCallback(async () => {
if (!runId) return
@ -2065,6 +2173,7 @@ function App() {
}, [])
const openChatInNewTab = useCallback((targetRunId: string) => {
cancelRecordingIfActive()
const existingTab = chatTabs.find(t => t.runId === targetRunId)
if (existingTab) {
// Cancel stale in-flight loads from previously focused tabs.
@ -2080,12 +2189,18 @@ function App() {
setChatTabs(prev => [...prev, { id, runId: targetRunId }])
setActiveChatTabId(id)
loadRun(targetRunId)
}, [chatTabs, loadRun, restoreChatTabState])
}, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])
const switchChatTab = useCallback((tabId: string) => {
const tab = chatTabs.find(t => t.id === tabId)
if (!tab) return
if (tabId === activeChatTabId) return
// Cancel any active recording when switching tabs
if (isRecordingRef.current) {
voiceRef.current.cancel()
setIsRecording(false)
isRecordingRef.current = false
}
saveChatScrollForTab(activeChatTabId)
// Cancel stale in-flight loads from previously focused tabs.
loadRunRequestIdRef.current += 1
@ -2471,13 +2586,14 @@ function App() {
const current = currentViewState
if (viewStatesEqual(current, nextView)) return
cancelRecordingIfActive()
const nextHistory = {
back: appendUnique(historyRef.current.back, current),
forward: [] as ViewState[],
}
setHistory(nextHistory)
await applyViewState(nextView)
}, [appendUnique, applyViewState, currentViewState, setHistory])
}, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])
const navigateBack = useCallback(async () => {
const { back, forward } = historyRef.current
@ -3412,6 +3528,7 @@ function App() {
tasksActions={{
onNewChat: handleNewChatTab,
onSelectRun: (runIdToLoad) => {
cancelRecordingIfActive()
if (selectedPath || isGraphOpen) {
setIsChatSidebarOpen(true)
}
@ -3814,7 +3931,7 @@ function App() {
{tabState.currentAssistantMessage && (
<Message from="assistant">
<MessageContent>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
</MessageContent>
</Message>
)}
@ -3865,6 +3982,18 @@ function App() {
runId={tabState.runId}
initialDraft={chatDraftsRef.current.get(tab.id)}
onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
isRecording={isActive && isRecording}
recordingText={isActive ? voice.interimText : undefined}
recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
onStartRecording={isActive ? handleStartRecording : undefined}
onSubmitRecording={isActive ? handleSubmitRecording : undefined}
onCancelRecording={isActive ? handleCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? handleToggleTts : undefined}
onTtsModeChange={isActive ? handleTtsModeChange : undefined}
/>
</div>
)
@ -3914,6 +4043,18 @@ function App() {
onToolOpenChangeForTab={setToolOpenForTab}
onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
onActivate={() => setActiveShortcutPane('right')}
isRecording={isRecording}
recordingText={voice.interimText}
recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
onStartRecording={handleStartRecording}
onSubmitRecording={handleSubmitRecording}
onCancelRecording={handleCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={handleToggleTts}
onTtsModeChange={handleTtsModeChange}
/>
)}
{/* Rendered last so its no-drag region paints over the sidebar drag region */}

View file

@ -1,4 +1,5 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
import {
ArrowUp,
AudioLines,
@ -9,7 +10,9 @@ import {
FileSpreadsheet,
FileText,
FileVideo,
Headphones,
LoaderIcon,
Mic,
Plus,
Square,
X,
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
function ChatInputInner({
@ -115,6 +130,18 @@ function ChatInputInner({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputInnerProps) {
const controller = usePromptInputController()
const message = controller.textInput.value
@ -367,6 +394,40 @@ function ChatInputInner({
e.target.value = ''
}}
/>
{isRecording ? (
/* ── Recording bar ── */
<div className="flex items-center gap-3 px-4 py-3">
<button
type="button"
onClick={onCancelRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Cancel recording"
>
<X className="h-4 w-4" />
</button>
<div className="flex flex-1 items-center gap-2 overflow-hidden">
<VoiceWaveform />
<span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
{recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
</span>
</div>
<Button
size="icon"
onClick={onSubmitRecording}
disabled={!recordingText?.trim()}
className={cn(
'h-7 w-7 shrink-0 rounded-full transition-all',
recordingText?.trim()
? 'bg-primary text-primary-foreground hover:bg-primary/90'
: 'bg-muted text-muted-foreground'
)}
>
<ArrowUp className="h-4 w-4" />
</Button>
</div>
) : (
/* ── Normal input ── */
<>
<div className="px-4 pt-4 pb-2">
<PromptInputTextarea
placeholder="Type your message..."
@ -414,6 +475,63 @@ function ChatInputInner({
</DropdownMenuContent>
</DropdownMenu>
)}
{onToggleTts && ttsAvailable && (
<div className="flex shrink-0 items-center">
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
onClick={onToggleTts}
className={cn(
'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
ttsEnabled
? 'text-foreground hover:bg-muted'
: 'text-muted-foreground hover:bg-muted hover:text-foreground'
)}
aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
>
<Headphones className="h-4 w-4" />
{!ttsEnabled && (
<span className="absolute inset-0 flex items-center justify-center pointer-events-none">
<span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
</span>
)}
</button>
</TooltipTrigger>
<TooltipContent side="top">
{ttsEnabled ? 'Voice output on' : 'Voice output off'}
</TooltipContent>
</Tooltip>
{ttsEnabled && onTtsModeChange && (
<DropdownMenu>
<DropdownMenuTrigger asChild>
<button
type="button"
className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
>
<ChevronDown className="h-3 w-3" />
</button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end">
<DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
<DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
<DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
</DropdownMenuRadioGroup>
</DropdownMenuContent>
</DropdownMenu>
)}
</div>
)}
{voiceAvailable && onStartRecording && (
<button
type="button"
onClick={onStartRecording}
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
aria-label="Voice input"
>
<Mic className="h-4 w-4" />
</button>
)}
{isProcessing ? (
<Button
size="icon"
@ -448,6 +566,31 @@ function ChatInputInner({
</Button>
)}
</div>
</>
)}
</div>
)
}
/** Animated waveform bars for the recording indicator */
/** Animated five-bar waveform shown while a recording is in progress. */
function VoiceWaveform() {
  // Stagger each bar's animation by 150ms to produce the rolling-wave effect.
  const bars = Array.from({ length: 5 }, (_, index) => (
    <span
      key={index}
      className="w-[3px] rounded-full bg-primary"
      style={{ animation: `voice-wave 1.2s ease-in-out ${index * 0.15}s infinite` }}
    />
  ))
  return (
    <div className="flex items-center gap-[3px] h-5">
      {bars}
      <style>{`
        @keyframes voice-wave {
          0%, 100% { height: 4px; }
          50% { height: 16px; }
        }
      `}</style>
    </div>
  )
}
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
runId?: string | null
initialDraft?: string
onDraftChange?: (text: string) => void
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatInputWithMentions({
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
runId,
initialDraft,
onDraftChange,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatInputWithMentionsProps) {
return (
<PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
runId={runId}
initialDraft={initialDraft}
onDraftChange={onDraftChange}
isRecording={isRecording}
recordingText={recordingText}
recordingState={recordingState}
onStartRecording={onStartRecording}
onSubmitRecording={onSubmitRecording}
onCancelRecording={onCancelRecording}
voiceAvailable={voiceAvailable}
ttsAvailable={ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={onToggleTts}
onTtsModeChange={onTtsModeChange}
/>
</PromptInputProvider>
)

View file

@ -108,6 +108,19 @@ interface ChatSidebarProps {
onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
onOpenKnowledgeFile?: (path: string) => void
onActivate?: () => void
// Voice / TTS props
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
voiceAvailable?: boolean
ttsAvailable?: boolean
ttsEnabled?: boolean
ttsMode?: 'summary' | 'full'
onToggleTts?: () => void
onTtsModeChange?: (mode: 'summary' | 'full') => void
}
export function ChatSidebar({
@ -146,6 +159,18 @@ export function ChatSidebar({
onToolOpenChangeForTab,
onOpenKnowledgeFile,
onActivate,
isRecording,
recordingText,
recordingState,
onStartRecording,
onSubmitRecording,
onCancelRecording,
voiceAvailable,
ttsAvailable,
ttsEnabled,
ttsMode,
onToggleTts,
onTtsModeChange,
}: ChatSidebarProps) {
const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
const [isResizing, setIsResizing] = useState(false)
@ -542,6 +567,18 @@ export function ChatSidebar({
runId={tabState.runId}
initialDraft={getInitialDraft?.(tab.id)}
onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
isRecording={isActive && isRecording}
recordingText={isActive ? recordingText : undefined}
recordingState={isActive ? recordingState : undefined}
onStartRecording={isActive ? onStartRecording : undefined}
onSubmitRecording={isActive ? onSubmitRecording : undefined}
onCancelRecording={isActive ? onCancelRecording : undefined}
voiceAvailable={isActive && voiceAvailable}
ttsAvailable={isActive && ttsAvailable}
ttsEnabled={ttsEnabled}
ttsMode={ttsMode}
onToggleTts={isActive ? onToggleTts : undefined}
onTtsModeChange={isActive ? onTtsModeChange : undefined}
/>
</div>
)

View file

@ -0,0 +1,218 @@
import { useCallback, useEffect, useRef, useState } from 'react';
export type VoiceState = 'idle' | 'connecting' | 'listening';
// Cache the API key so we skip the IPC call after first use
let cachedApiKey: string | null = null;
let apiKeyFetched = false;
/**
 * Push-to-talk speech-to-text via Deepgram's realtime WebSocket API.
 *
 * The WebSocket is opened once on mount and kept alive (auto-reconnecting
 * after 3s on close) for as long as the hook is mounted; only microphone
 * capture starts/stops per recording.
 *
 * Returns:
 * - state: 'idle' | 'connecting' | 'listening'
 * - interimText: live transcript (finalized segments + current interim)
 * - start(): begin capturing mic audio and streaming it for transcription
 * - submit(): stop capturing and return the accumulated transcript
 * - cancel(): stop capturing and discard the transcript
 */
export function useVoiceMode() {
  const [state, setState] = useState<VoiceState>('idle');
  const [interimText, setInterimText] = useState('');
  const wsRef = useRef<WebSocket | null>(null);
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const audioCtxRef = useRef<AudioContext | null>(null);
  // Finalized transcript segments, joined with single spaces.
  const transcriptBufferRef = useRef('');
  // Latest non-final (interim) segment; replaced wholesale on each interim result.
  const interimRef = useRef('');
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const mountedRef = useRef(true);
  // Connect (or reconnect) the Deepgram WebSocket.
  // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
  const connectWs = useCallback(() => {
    if (!cachedApiKey) return;
    if (wsRef.current && (wsRef.current.readyState === WebSocket.OPEN || wsRef.current.readyState === WebSocket.CONNECTING)) return;
    const ws = new WebSocket(
      `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`,
      ['token', cachedApiKey]
    );
    wsRef.current = ws;
    ws.onopen = () => {
      console.log('[voice] WebSocket connected');
    };
    ws.onmessage = (event) => {
      // Guard against malformed frames: an unguarded JSON.parse throw here
      // would surface as an unhandled error inside the WS handler.
      let data: { channel?: { alternatives?: { transcript?: string }[] }; is_final?: boolean };
      try {
        data = JSON.parse(event.data);
      } catch {
        return;
      }
      if (!data.channel?.alternatives?.[0]) return;
      const transcript = data.channel.alternatives[0].transcript;
      if (!transcript) return;
      if (data.is_final) {
        // Finalized segment: append to the committed buffer and drop the interim.
        transcriptBufferRef.current += (transcriptBufferRef.current ? ' ' : '') + transcript;
        interimRef.current = '';
        setInterimText(transcriptBufferRef.current);
      } else {
        // Interim segment: show committed text + the current best guess.
        interimRef.current = transcript;
        setInterimText(transcriptBufferRef.current + (transcriptBufferRef.current ? ' ' : '') + transcript);
      }
    };
    ws.onerror = () => {
      console.error('[voice] WebSocket error');
    };
    ws.onclose = () => {
      console.log('[voice] WebSocket closed');
      wsRef.current = null;
      // Auto-reconnect after 3 seconds if still mounted
      if (mountedRef.current && cachedApiKey) {
        reconnectTimerRef.current = setTimeout(() => {
          if (mountedRef.current) connectWs();
        }, 3000);
      }
    };
  }, []);
  // Fetch API key on mount and establish persistent WebSocket
  useEffect(() => {
    mountedRef.current = true;
    const init = async () => {
      if (!apiKeyFetched) {
        apiKeyFetched = true;
        try {
          const config = await window.ipc.invoke('voice:getConfig', null);
          cachedApiKey = config.deepgram?.apiKey ?? null;
        } catch { /* ignore */ }
      }
      if (cachedApiKey && mountedRef.current) {
        connectWs();
      }
    };
    void init();
    return () => {
      mountedRef.current = false;
      if (reconnectTimerRef.current) {
        clearTimeout(reconnectTimerRef.current);
        reconnectTimerRef.current = null;
      }
      // Close WS on unmount, suppress reconnect by nulling onclose
      if (wsRef.current) {
        wsRef.current.onclose = null;
        wsRef.current.close();
        wsRef.current = null;
      }
    };
  }, [connectWs]);
  // Stop only audio capture (mic + processor), leaving WS open
  const stopAudioCapture = useCallback(() => {
    if (processorRef.current) {
      processorRef.current.disconnect();
      processorRef.current = null;
    }
    if (audioCtxRef.current) {
      // close() returns a promise; this is deliberately fire-and-forget.
      void audioCtxRef.current.close();
      audioCtxRef.current = null;
    }
    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach(t => t.stop());
      mediaStreamRef.current = null;
    }
    setInterimText('');
    transcriptBufferRef.current = '';
    interimRef.current = '';
    setState('idle');
  }, []);
  /** Begin a recording: ensure the WS is open, then stream 16 kHz PCM from the mic. */
  const start = useCallback(async () => {
    if (state !== 'idle') return;
    // Ensure we have an API key (the mount-time fetch may have failed).
    if (!cachedApiKey) {
      try {
        const config = await window.ipc.invoke('voice:getConfig', null);
        cachedApiKey = config.deepgram?.apiKey ?? null;
      } catch { /* ignore */ }
    }
    if (!cachedApiKey) {
      console.error('Deepgram not configured');
      return;
    }
    transcriptBufferRef.current = '';
    interimRef.current = '';
    setInterimText('');
    // If WS isn't connected, connect and wait for it
    if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
      setState('connecting');
      connectWs();
      // Wait for WS to be ready (up to 5 seconds). Both timers are cleared
      // on whichever path wins so neither leaks past the race.
      const wsOk = await new Promise<boolean>((resolve) => {
        const deadline = setTimeout(() => {
          clearInterval(poll);
          resolve(false);
        }, 5000);
        const poll = setInterval(() => {
          if (wsRef.current?.readyState === WebSocket.OPEN) {
            clearTimeout(deadline);
            clearInterval(poll);
            resolve(true);
          }
        }, 50);
      });
      if (!wsOk) {
        setState('idle');
        return;
      }
    }
    setState('listening');
    // Start mic
    let stream: MediaStream | null = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error('Microphone access denied:', err);
      setState('idle');
      return;
    }
    mediaStreamRef.current = stream;
    // Start audio capture.
    // NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet;
    // kept here to preserve behavior — consider migrating.
    const audioCtx = new AudioContext({ sampleRate: 16000 });
    audioCtxRef.current = audioCtx;
    const source = audioCtx.createMediaStreamSource(stream);
    const processor = audioCtx.createScriptProcessor(4096, 1, 1);
    processorRef.current = processor;
    processor.onaudioprocess = (e) => {
      if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
      // Convert float32 [-1, 1] samples to signed 16-bit PCM for Deepgram.
      const float32 = e.inputBuffer.getChannelData(0);
      const int16 = new Int16Array(float32.length);
      for (let i = 0; i < float32.length; i++) {
        const s = Math.max(-1, Math.min(1, float32[i]));
        int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      wsRef.current.send(int16.buffer);
    };
    source.connect(processor);
    // The processor must be connected to a destination to fire onaudioprocess.
    processor.connect(audioCtx.destination);
  }, [state, connectWs]);
  /** Stop recording and return the full transcript (finalized + any current interim) */
  const submit = useCallback((): string => {
    let text = transcriptBufferRef.current;
    if (interimRef.current) {
      text += (text ? ' ' : '') + interimRef.current;
    }
    text = text.trim();
    stopAudioCapture();
    return text;
  }, [stopAudioCapture]);
  /** Cancel recording without returning transcript */
  const cancel = useCallback(() => {
    stopAudioCapture();
  }, [stopAudioCapture]);
  return { state, interimText, start, submit, cancel };
}

View file

@ -0,0 +1,72 @@
import { useCallback, useRef, useState } from 'react';
export type TTSState = 'idle' | 'synthesizing' | 'speaking';
/**
 * Text-to-speech playback queue.
 *
 * Sentences passed to speak() are queued and played one at a time: each is
 * synthesized via the 'voice:synthesize' IPC channel, then played through an
 * HTMLAudioElement from a base64 data URL. cancel() flushes the queue and
 * stops any in-flight playback immediately.
 *
 * Bug fixed vs. the original: cancel() paused the audio, but pause() never
 * fires 'onended', so the playback promise inside processQueue never settled
 * and the loop hung forever — while cancel() also reset processingRef, letting
 * the next speak() start a second concurrent loop. The resolver ref and
 * cancelled flag below make cancel() settle the pending promise and drain the
 * loop cleanly.
 */
export function useVoiceTTS() {
  const [state, setState] = useState<TTSState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const queueRef = useRef<string[]>([]);
  const processingRef = useRef(false);
  // Set by cancel(); checked after each await so the loop drains promptly.
  const cancelledRef = useRef(false);
  // Settles the in-flight playback promise when cancel() pauses the audio.
  const resolvePlaybackRef = useRef<(() => void) | null>(null);
  const processQueue = useCallback(async () => {
    if (processingRef.current) return;
    processingRef.current = true;
    cancelledRef.current = false;
    while (queueRef.current.length > 0 && !cancelledRef.current) {
      const text = queueRef.current.shift()!;
      if (!text.trim()) continue;
      setState('synthesizing');
      console.log('[tts] synthesizing:', text.substring(0, 80));
      try {
        const result = await window.ipc.invoke('voice:synthesize', { text });
        // A cancel() may have arrived while we awaited synthesis.
        if (cancelledRef.current) break;
        console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
        setState('speaking');
        await new Promise<void>((resolve, reject) => {
          resolvePlaybackRef.current = resolve;
          const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
          const audio = new Audio(dataUrl);
          audioRef.current = audio;
          audio.onended = () => {
            console.log('[tts] audio ended');
            resolve();
          };
          audio.onerror = (e) => {
            console.error('[tts] audio error:', e);
            reject(new Error('Audio playback failed'));
          };
          audio.play().then(() => {
            console.log('[tts] audio playing');
          }).catch((err) => {
            console.error('[tts] play() rejected:', err);
            reject(err);
          });
        });
      } catch (err) {
        console.error('[tts] error:', err);
      } finally {
        resolvePlaybackRef.current = null;
      }
    }
    audioRef.current = null;
    processingRef.current = false;
    setState('idle');
  }, []);
  /** Queue a sentence for synthesis + playback. Safe to call while speaking. */
  const speak = useCallback((text: string) => {
    console.log('[tts] speak() called:', text.substring(0, 80));
    queueRef.current.push(text);
    // Fire-and-forget: processQueue handles its own errors internally.
    void processQueue();
  }, [processQueue]);
  /** Flush the queue and stop any current playback immediately. */
  const cancel = useCallback(() => {
    cancelledRef.current = true;
    queueRef.current = [];
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current = null;
    }
    // Settle the pending playback promise so processQueue's loop can exit;
    // pause() alone never fires onended. The loop resets processingRef itself.
    resolvePlaybackRef.current?.();
    setState('idle');
  }, []);
  return { state, speak, cancel };
}

View file

@ -894,11 +894,19 @@ export async function* streamAgent({
}
// get any queued user messages
let voiceInput = false;
let voiceOutput: 'summary' | 'full' | null = null;
while (true) {
const msg = await messageQueue.dequeue(runId);
if (!msg) {
break;
}
if (msg.voiceInput) {
voiceInput = true;
}
if (msg.voiceOutput) {
voiceOutput = msg.voiceOutput;
}
loopLogger.log('dequeued user message', msg.messageId);
yield* processEvent({
runId,
@ -938,7 +946,18 @@ export async function* streamAgent({
minute: '2-digit',
timeZoneName: 'short'
});
const instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
let instructionsWithDateTime = `Current date and time: ${currentDateTime}\n\n${agent.instructions}`;
if (voiceInput) {
loopLogger.log('voice input enabled, injecting voice input prompt');
instructionsWithDateTime += `\n\n# Voice Input\nThe user's message was transcribed from speech. Be aware that:\n- There may be transcription errors. Silently correct obvious ones (e.g. homophones, misheard words). If an error is genuinely ambiguous, briefly mention your interpretation (e.g. "I'm assuming you meant X").\n- Spoken messages are often long-winded. The user may ramble, repeat themselves, or correct something they said earlier in the same message. Focus on their final intent, not every word verbatim.`;
}
if (voiceOutput === 'summary') {
loopLogger.log('voice output enabled (summary mode), injecting voice output prompt');
instructionsWithDateTime += `\n\n# Voice Output (MANDATORY)\nThe user has voice output enabled. You MUST start your response with <voice></voice> tags that provide a spoken summary and guide to your written response. This is NOT optional — every response MUST begin with <voice> tags.\n\nRules:\n1. ALWAYS start your response with one or more <voice> tags. Never skip them.\n2. Place ALL <voice> tags at the BEGINNING of your response, before any detailed content. Do NOT intersperse <voice> tags throughout the response.\n3. Wrap EACH spoken sentence in its own separate <voice> tag so it can be spoken incrementally. Do NOT wrap everything in a single <voice> block.\n4. Use voice as a TL;DR and navigation aid — do NOT read the entire response aloud.\n\nExample — if the user asks "what happened in my meeting with Sarah yesterday?":\n<voice>Your meeting with Sarah covered three main things: the Q2 roadmap timeline, hiring for the backend role, and the client demo next week.</voice>\n<voice>I've pulled out the key details and action items below — the demo prep notes are at the end.</voice>\n\n## Meeting with Sarah — March 11\n(Then the full detailed written response follows without any more <voice> tags.)\n\nAny text outside <voice> tags is shown visually but not spoken.`;
} else if (voiceOutput === 'full') {
loopLogger.log('voice output enabled (full mode), injecting voice output prompt');
instructionsWithDateTime += `\n\n# Voice Output — Full Read-Aloud (MANDATORY)\nThe user wants your ENTIRE response spoken aloud. You MUST wrap your full response in <voice></voice> tags. This is NOT optional.\n\nRules:\n1. Wrap EACH sentence in its own separate <voice> tag so it can be spoken incrementally.\n2. Write your response in a natural, conversational style suitable for listening — no markdown headings, bullet points, or formatting symbols. Use plain spoken language.\n3. Structure the content as if you are speaking to the user directly. Use transitions like "first", "also", "one more thing" instead of visual formatting.\n4. Every sentence MUST be inside a <voice> tag. Do not leave any content outside <voice> tags.\n\nExample:\n<voice>Your meeting with Sarah covered three main things.</voice>\n<voice>First, you discussed the Q2 roadmap timeline and agreed to push the launch to April.</voice>\n<voice>Second, you talked about hiring for the backend role — Sarah will send over two candidates by Friday.</voice>\n<voice>And lastly, the client demo is next week on Thursday at 2pm, and you're handling the intro slides.</voice>`;
}
let streamError: string | null = null;
for await (const event of streamLlm(
model,

View file

@ -33,6 +33,8 @@ Rowboat is an agentic assistant for everyday work - emails, meetings, projects,
**Document Collaboration:** When users ask you to work on a document, collaborate on writing, create a new document, edit/refine existing notes, or say things like "let's work on [X]", "help me write [X]", "create a doc for [X]", or "let's draft [X]", you MUST load the \`doc-collab\` skill first. This is required for any document creation or editing task. The skill provides structured guidance for creating, editing, and refining documents in the knowledge base.
**App Control:** When users ask you to open notes, show the bases or graph view, filter or search notes, or manage saved views, load the \`app-navigation\` skill first. It provides structured guidance for navigating the app UI and controlling the knowledge base view.
**Slack:** When users ask about Slack messages, want to send messages to teammates, check channel conversations, or find someone on Slack, load the \`slack\` skill. You can send messages, view channel history, search conversations, and find users. Always show message drafts to the user before sending.
## Memory That Compounds
@ -184,6 +186,7 @@ ${runtimeContextPrompt}
- \`loadSkill\` - Skill loading
- \`slack-checkConnection\`, \`slack-listAvailableTools\`, \`slack-executeAction\` - Slack integration (requires Slack to be connected via Composio). Use \`slack-listAvailableTools\` first to discover available tool slugs, then \`slack-executeAction\` to execute them.
- \`web-search\` and \`research-search\` - Web and research search tools (available when configured). **You MUST load the \`web-search\` skill before using either of these tools.** It tells you which tool to pick and how many searches to do.
- \`app-navigation\` - Control the app UI: open notes, switch views, filter/search the knowledge base, manage saved views. **Load the \`app-navigation\` skill before using this tool.**
**Prefer these tools whenever possible** — they work instantly with zero friction. For file operations inside \`~/.rowboat/\`, always use these instead of \`executeCommand\`.

View file

@ -44,6 +44,7 @@ Change filters, columns, sort order, or search in the bases (table) view.
- If unsure what categories/values are available, call ` + "`get-base-state`" + ` first.
- For "show me X", prefer ` + "`filters.set`" + ` to start fresh rather than ` + "`filters.add`" + `.
- Categories come from frontmatter keys (e.g., relationship, status, topic, type).
- **CRITICAL: Do NOT pass ` + "`columns`" + ` unless the user explicitly asks to show/hide specific columns.** Omit the ` + "`columns`" + ` parameter entirely when only filtering, sorting, or searching. Passing ` + "`columns`" + ` will override the user's current column layout and can make the view appear empty.
### get-base-state
Retrieve information about what's in the knowledge base — available filter categories, values, and note count.
@ -75,6 +76,7 @@ Save the current view configuration as a named base.
- The ` + "`update-base-view`" + ` action will automatically navigate to the bases view if the user isn't already there.
- ` + "`open-note`" + ` validates that the file exists before navigating.
- Filter categories and values come from frontmatter in knowledge files.
- **Never send ` + "`columns`" + ` or ` + "`sort`" + ` with ` + "`update-base-view`" + ` unless the user specifically asks to change them.** Only pass the parameters you intend to change — omitted parameters are left untouched.
`;
export default skill;

View file

@ -884,6 +884,145 @@ export const BuiltinTools: z.infer<typeof BuiltinToolsSchema> = {
},
},
// ============================================================================
// App Navigation
// ============================================================================
// Builtin tool that lets the agent drive the app UI. Most actions return a
// structured payload that the renderer interprets to perform the navigation;
// only 'open-note', 'get-base-state', and 'create-base' touch the workspace.
'app-navigation': {
  description: 'Control the app UI - navigate to notes, switch views, filter/search the knowledge base, and manage saved views.',
  inputSchema: z.object({
    action: z.enum(["open-note", "open-view", "update-base-view", "get-base-state", "create-base"]).describe("The navigation action to perform"),
    // open-note
    path: z.string().optional().describe("Knowledge file path for open-note, e.g. knowledge/People/John.md"),
    // open-view
    view: z.enum(["bases", "graph"]).optional().describe("Which view to open (for open-view action)"),
    // update-base-view
    filters: z.object({
      set: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Replace all filters with these"),
      add: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Add these filters"),
      remove: z.array(z.object({ category: z.string(), value: z.string() })).optional().describe("Remove these filters"),
      clear: z.boolean().optional().describe("Clear all filters"),
    }).optional().describe("Filter modifications (for update-base-view)"),
    columns: z.object({
      set: z.array(z.string()).optional().describe("Replace visible columns with these"),
      add: z.array(z.string()).optional().describe("Add these columns"),
      remove: z.array(z.string()).optional().describe("Remove these columns"),
    }).optional().describe("Column modifications (for update-base-view)"),
    sort: z.object({
      field: z.string(),
      dir: z.enum(["asc", "desc"]),
    }).optional().describe("Sort configuration (for update-base-view)"),
    search: z.string().optional().describe("Search query to filter notes (for update-base-view)"),
    // get-base-state
    base_name: z.string().optional().describe("Name of a saved base to inspect (for get-base-state). Omit for the current/default view."),
    // create-base
    name: z.string().optional().describe("Name for the saved base view (for create-base)"),
  }),
  execute: async (input: {
    action: string;
    [key: string]: unknown;
  }) => {
    switch (input.action) {
      case 'open-note': {
        // Validate the path before touching the filesystem; the schema marks
        // it optional, so a missing path must yield an error result, not a
        // lookup on `undefined`.
        const filePath = input.path;
        if (typeof filePath !== 'string' || !filePath) {
          return { success: false, error: 'Missing required parameter: path' };
        }
        try {
          const result = await workspace.exists(filePath);
          if (!result.exists) {
            return { success: false, error: `File not found: ${filePath}` };
          }
          return { success: true, action: 'open-note', path: filePath };
        } catch {
          return { success: false, error: `Could not access file: ${filePath}` };
        }
      }
      case 'open-view': {
        // Pure UI instruction; the renderer performs the actual switch.
        const view = input.view as string;
        return { success: true, action: 'open-view', view };
      }
      case 'update-base-view': {
        // Forward only the pieces the caller supplied so omitted parameters
        // leave the user's current view untouched.
        const updates: Record<string, unknown> = {};
        if (input.filters) updates.filters = input.filters;
        if (input.columns) updates.columns = input.columns;
        if (input.sort) updates.sort = input.sort;
        if (input.search !== undefined) updates.search = input.search;
        return { success: true, action: 'update-base-view', updates };
      }
      case 'get-base-state': {
        // Scan knowledge/ files and extract frontmatter properties.
        // NOTE(review): `base_name` is accepted by the schema but not used
        // here — the scan always reflects the whole knowledge base.
        try {
          const { parseFrontmatter } = await import("@x/shared/dist/frontmatter.js");
          const entries = await workspace.readdir("knowledge", { recursive: true, allowedExtensions: [".md"] });
          const files = entries.filter(e => e.kind === 'file');
          // category name -> set of distinct values seen across all notes
          const properties = new Map<string, Set<string>>();
          let noteCount = 0;
          for (const file of files) {
            try {
              const { data } = await workspace.readFile(file.path);
              const { fields } = parseFrontmatter(data);
              noteCount++;
              for (const [key, value] of Object.entries(fields)) {
                if (!value) continue;
                let set = properties.get(key);
                if (!set) { set = new Set(); properties.set(key, set); }
                const values = Array.isArray(value) ? value : [value];
                for (const v of values) {
                  // Frontmatter values may be non-strings (numbers, dates,
                  // booleans); coerce before trimming so a numeric value
                  // doesn't throw and abort the whole file's scan.
                  const trimmed = String(v).trim();
                  if (trimmed) set.add(trimmed);
                }
              }
            } catch {
              // skip unreadable files
            }
          }
          const availableProperties: Record<string, string[]> = {};
          for (const [key, values] of properties) {
            availableProperties[key] = [...values].sort();
          }
          return {
            success: true,
            action: 'get-base-state',
            noteCount,
            availableProperties,
          };
        } catch (error) {
          return {
            success: false,
            error: error instanceof Error ? error.message : 'Failed to read knowledge base',
          };
        }
      }
      case 'create-base': {
        // `name` is optional in the schema; a missing name must produce an
        // error result rather than a TypeError on `.replace()`.
        const rawName = typeof input.name === 'string' ? input.name : '';
        // Strip characters unsafe for filenames; empty result means invalid.
        const safeName = rawName.replace(/[^a-zA-Z0-9_\- ]/g, '').trim();
        if (!safeName) {
          return { success: false, error: 'Invalid base name' };
        }
        const basePath = `bases/${safeName}.base`;
        try {
          const config = { name: safeName, filters: [], columns: [] };
          await workspace.writeFile(basePath, JSON.stringify(config, null, 2), { mkdirp: true });
          return { success: true, action: 'create-base', name: safeName, path: basePath };
        } catch (error) {
          return {
            success: false,
            error: error instanceof Error ? error.message : 'Failed to create base',
          };
        }
      }
      default:
        return { success: false, error: `Unknown action: ${input.action}` };
    }
  },
},
// ============================================================================
// Web Search (Brave Search API)
// ============================================================================

View file

@ -3,14 +3,17 @@ import { UserMessageContent } from "@x/shared/dist/message.js";
import z from "zod";
export type UserMessageContentType = z.infer<typeof UserMessageContent>;
export type VoiceOutputMode = 'summary' | 'full';
type EnqueuedMessage = {
messageId: string;
message: UserMessageContentType;
voiceInput?: boolean;
voiceOutput?: VoiceOutputMode;
};
export interface IMessageQueue {
enqueue(runId: string, message: UserMessageContentType): Promise<string>;
enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string>;
dequeue(runId: string): Promise<EnqueuedMessage | null>;
}
@ -26,7 +29,7 @@ export class InMemoryMessageQueue implements IMessageQueue {
this.idGenerator = idGenerator;
}
async enqueue(runId: string, message: UserMessageContentType): Promise<string> {
async enqueue(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
if (!this.store[runId]) {
this.store[runId] = [];
}
@ -34,6 +37,8 @@ export class InMemoryMessageQueue implements IMessageQueue {
this.store[runId].push({
messageId: id,
message,
voiceInput,
voiceOutput,
});
return id;
}
@ -44,4 +49,4 @@ export class InMemoryMessageQueue implements IMessageQueue {
}
return this.store[runId].shift() ?? null;
}
}
}

View file

@ -9,3 +9,6 @@ export { initConfigs } from './config/initConfigs.js';
// Knowledge version history
export * as versionHistory from './knowledge/version_history.js';
// Voice mode (config + TTS)
export * as voice from './voice/voice.js';

View file

@ -64,7 +64,7 @@ export function createProvider(config: z.infer<typeof Provider>): ProviderV2 {
apiKey,
baseURL,
headers,
});
}) as unknown as ProviderV2;
default:
throw new Error(`Unsupported provider flavor: ${config.flavor}`);
}

View file

@ -1,6 +1,6 @@
import z from "zod";
import container from "../di/container.js";
import { IMessageQueue, UserMessageContentType } from "../application/lib/message-queue.js";
import { IMessageQueue, UserMessageContentType, VoiceOutputMode } from "../application/lib/message-queue.js";
import { AskHumanResponseEvent, ToolPermissionRequestEvent, ToolPermissionResponseEvent, CreateRunOptions, Run, ListRunsResponse, ToolPermissionAuthorizePayload, AskHumanResponsePayload } from "@x/shared/dist/runs.js";
import { IRunsRepo } from "./repo.js";
import { IAgentRuntime } from "../agents/runtime.js";
@ -19,9 +19,9 @@ export async function createRun(opts: z.infer<typeof CreateRunOptions>): Promise
return run;
}
export async function createMessage(runId: string, message: UserMessageContentType): Promise<string> {
export async function createMessage(runId: string, message: UserMessageContentType, voiceInput?: boolean, voiceOutput?: VoiceOutputMode): Promise<string> {
const queue = container.resolve<IMessageQueue>('messageQueue');
const id = await queue.enqueue(runId, message);
const id = await queue.enqueue(runId, message, voiceInput, voiceOutput);
const runtime = container.resolve<IAgentRuntime>('agentRuntime');
runtime.trigger(runId);
return id;

View file

@ -0,0 +1,70 @@
import * as fs from 'fs/promises';
import * as path from 'path';
// User home directory: HOME on POSIX, USERPROFILE on Windows; empty string
// if neither is set (config lookups will then simply fail and return null).
const homedir = process.env.HOME || process.env.USERPROFILE || '';
// Voice-mode provider credentials loaded from ~/.rowboat/config/.
// A null entry means that provider is not configured on this machine.
export interface VoiceConfig {
// Deepgram credentials — presumably used for speech-to-text input
// (only the key is surfaced here; usage lives in the renderer). TODO confirm.
deepgram: { apiKey: string } | null;
// ElevenLabs credentials for text-to-speech; voiceId optionally overrides
// the default voice used by synthesizeSpeech().
elevenlabs: { apiKey: string; voiceId?: string } | null;
}
/**
 * Best-effort read of a JSON config file under ~/.rowboat/config/.
 *
 * @param filename File name within the config directory, e.g. 'deepgram.json'.
 * @returns The parsed object, or null when the file is missing, unreadable,
 *          or contains malformed JSON — all treated as "not configured".
 */
async function readJsonConfig(filename: string): Promise<Record<string, unknown> | null> {
  const configPath = path.join(homedir, '.rowboat', 'config', filename);
  try {
    const contents = await fs.readFile(configPath, 'utf8');
    return JSON.parse(contents);
  } catch {
    // Any failure (ENOENT, permissions, bad JSON) means the provider
    // is simply not set up; callers handle the null.
    return null;
  }
}
/**
 * Load voice-mode provider credentials from disk.
 *
 * Reads deepgram.json and elevenlabs.json from ~/.rowboat/config/; each
 * provider entry is null unless its file exists and contains an `apiKey`.
 */
export async function getVoiceConfig(): Promise<VoiceConfig> {
  // The two reads are independent — fetch them in parallel.
  const [dg, el] = await Promise.all([
    readJsonConfig('deepgram.json'),
    readJsonConfig('elevenlabs.json'),
  ]);
  const deepgram = dg?.apiKey ? { apiKey: dg.apiKey as string } : null;
  const elevenlabs = el?.apiKey
    ? { apiKey: el.apiKey as string, voiceId: el.voiceId as string | undefined }
    : null;
  return { deepgram, elevenlabs };
}
/**
 * Convert text to speech via the ElevenLabs REST API.
 *
 * @param text Text to synthesize.
 * @returns Base64-encoded audio plus its MIME type ('audio/mpeg').
 * @throws When ElevenLabs is not configured, or the API returns a non-2xx
 *         status (the response body is included in the error message).
 */
export async function synthesizeSpeech(text: string): Promise<{ audioBase64: string; mimeType: string }> {
  const { elevenlabs } = await getVoiceConfig();
  if (!elevenlabs) {
    throw new Error('ElevenLabs not configured. Create ~/.rowboat/config/elevenlabs.json with { "apiKey": "<your-key>" }');
  }

  // Fall back to a hard-coded default voice when none is configured.
  const voiceId = elevenlabs.voiceId || 'UgBBYS2sOqTuMpoF3BR0';
  console.log('[voice] synthesizing speech, text length:', text.length, 'voiceId:', voiceId);

  const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
    method: 'POST',
    headers: {
      'xi-api-key': elevenlabs.apiKey,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      text,
      model_id: 'eleven_multilingual_v2',
      voice_settings: { stability: 0.5, similarity_boost: 0.75 },
    }),
  });

  if (!response.ok) {
    // Surface the API's error body; fall back if even reading it fails.
    const errText = await response.text().catch(() => 'Unknown error');
    console.error('[voice] ElevenLabs API error:', response.status, errText);
    throw new Error(`ElevenLabs API error ${response.status}: ${errText}`);
  }

  const audioBase64 = Buffer.from(await response.arrayBuffer()).toString('base64');
  console.log('[voice] synthesized audio, base64 length:', audioBase64.length);
  return { audioBase64, mimeType: 'audio/mpeg' };
}

View file

@ -130,6 +130,8 @@ const ipcSchemas = {
req: z.object({
runId: z.string(),
message: UserMessageContent,
voiceInput: z.boolean().optional(),
voiceOutput: z.enum(['summary', 'full']).optional(),
}),
res: z.object({
messageId: z.string(),
@ -460,6 +462,23 @@ const ipcSchemas = {
})),
}),
},
// Voice mode channels
'voice:getConfig': {
req: z.null(),
res: z.object({
deepgram: z.object({ apiKey: z.string() }).nullable(),
elevenlabs: z.object({ apiKey: z.string(), voiceId: z.string().optional() }).nullable(),
}),
},
'voice:synthesize': {
req: z.object({
text: z.string(),
}),
res: z.object({
audioBase64: z.string(),
mimeType: z.string(),
}),
},
// Inline task schedule classification
'inline-task:classifySchedule': {
req: z.object({