mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-05-19 18:35:18 +02:00
voice mode with speech-to-text input and text-to-speech output
This commit is contained in:
parent
d150294af1
commit
47d5118448
17 changed files with 937 additions and 15 deletions
|
|
@ -38,7 +38,7 @@ import { IAgentScheduleRepo } from '@x/core/dist/agent-schedule/repo.js';
|
|||
import { IAgentScheduleStateRepo } from '@x/core/dist/agent-schedule/state-repo.js';
|
||||
import { triggerRun as triggerAgentScheduleRun } from '@x/core/dist/agent-schedule/runner.js';
|
||||
import { search } from '@x/core/dist/search/search.js';
|
||||
import { versionHistory } from '@x/core';
|
||||
import { versionHistory, voice } from '@x/core';
|
||||
import { classifySchedule } from '@x/core/dist/knowledge/inline_tasks.js';
|
||||
|
||||
type InvokeChannels = ipc.InvokeChannels;
|
||||
|
|
@ -352,7 +352,7 @@ export function setupIpcHandlers() {
|
|||
return runsCore.createRun(args);
|
||||
},
|
||||
'runs:createMessage': async (_event, args) => {
|
||||
return { messageId: await runsCore.createMessage(args.runId, args.message) };
|
||||
return { messageId: await runsCore.createMessage(args.runId, args.message, args.voiceInput, args.voiceOutput) };
|
||||
},
|
||||
'runs:authorizePermission': async (_event, args) => {
|
||||
await runsCore.authorizePermission(args.runId, args.authorization);
|
||||
|
|
@ -571,5 +571,11 @@ export function setupIpcHandlers() {
|
|||
const schedule = await classifySchedule(args.instruction);
|
||||
return { schedule };
|
||||
},
|
||||
'voice:getConfig': async () => {
|
||||
return voice.getVoiceConfig();
|
||||
},
|
||||
'voice:synthesize': async (_event, args) => {
|
||||
return voice.synthesizeSpeech(args.text);
|
||||
},
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { app, BrowserWindow, protocol, net, shell } from "electron";
|
||||
import { app, BrowserWindow, protocol, net, shell, session } from "electron";
|
||||
import path from "node:path";
|
||||
import {
|
||||
setupIpcHandlers,
|
||||
|
|
@ -92,6 +92,15 @@ function createWindow() {
|
|||
},
|
||||
});
|
||||
|
||||
// Voice mode needs microphone access. Chromium's 'media' permission covers both
// microphone and camera, so approve only media requests that do not ask for
// video; every other permission type stays denied, as before.
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback, details) => {
  if (permission !== 'media') {
    callback(false);
    return;
  }

  // `mediaTypes` lists what the page asked for ('audio' and/or 'video').
  // When it is absent we fall back to the previous behaviour and allow the request.
  const mediaTypes = 'mediaTypes' in details ? (details.mediaTypes ?? []) : [];
  callback(!mediaTypes.includes('video'));
});
|
||||
|
||||
// Show window when content is ready to prevent blank screen
|
||||
win.once("ready-to-show", () => {
|
||||
win.show();
|
||||
|
|
|
|||
|
|
@ -76,6 +76,8 @@ import {
|
|||
import { AgentScheduleConfig } from '@x/shared/dist/agent-schedule.js'
|
||||
import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
|
||||
import { toast } from "sonner"
|
||||
import { useVoiceMode } from '@/hooks/useVoiceMode'
|
||||
import { useVoiceTTS } from '@/hooks/useVoiceTTS'
|
||||
|
||||
type DirEntry = z.infer<typeof workspace.DirEntry>
|
||||
type RunEventType = z.infer<typeof RunEvent>
|
||||
|
|
@ -546,6 +548,87 @@ function App() {
|
|||
const [agentId] = useState<string>('copilot')
|
||||
const [presetMessage, setPresetMessage] = useState<string | undefined>(undefined)
|
||||
|
||||
// Voice mode state
|
||||
const [voiceAvailable, setVoiceAvailable] = useState(false)
|
||||
const [ttsAvailable, setTtsAvailable] = useState(false)
|
||||
const [ttsEnabled, setTtsEnabled] = useState(false)
|
||||
const ttsEnabledRef = useRef(false)
|
||||
const [ttsMode, setTtsMode] = useState<'summary' | 'full'>('summary')
|
||||
const ttsModeRef = useRef<'summary' | 'full'>('summary')
|
||||
const [isRecording, setIsRecording] = useState(false)
|
||||
const voiceTextBufferRef = useRef('')
|
||||
const spokenIndexRef = useRef(0)
|
||||
const isRecordingRef = useRef(false)
|
||||
|
||||
const tts = useVoiceTTS()
|
||||
const ttsRef = useRef(tts)
|
||||
ttsRef.current = tts
|
||||
|
||||
const voice = useVoiceMode()
|
||||
const voiceRef = useRef(voice)
|
||||
voiceRef.current = voice
|
||||
|
||||
// Check if voice is available on mount
|
||||
useEffect(() => {
|
||||
window.ipc.invoke('voice:getConfig', null).then(config => {
|
||||
setVoiceAvailable(!!config.deepgram)
|
||||
setTtsAvailable(!!config.elevenlabs)
|
||||
}).catch(() => {
|
||||
setVoiceAvailable(false)
|
||||
setTtsAvailable(false)
|
||||
})
|
||||
}, [])
|
||||
|
||||
const handleStartRecording = useCallback(() => {
|
||||
setIsRecording(true)
|
||||
isRecordingRef.current = true
|
||||
voice.start()
|
||||
}, [voice])
|
||||
|
||||
const handlePromptSubmitRef = useRef<((msg: { text: string }) => void) | null>(null)
|
||||
const pendingVoiceInputRef = useRef(false)
|
||||
|
||||
const handleSubmitRecording = useCallback(() => {
|
||||
const text = voice.submit()
|
||||
setIsRecording(false)
|
||||
isRecordingRef.current = false
|
||||
if (text) {
|
||||
pendingVoiceInputRef.current = true
|
||||
handlePromptSubmitRef.current?.({ text })
|
||||
}
|
||||
}, [voice])
|
||||
|
||||
const handleToggleTts = useCallback(() => {
|
||||
setTtsEnabled(prev => {
|
||||
const next = !prev
|
||||
ttsEnabledRef.current = next
|
||||
if (!next) {
|
||||
ttsRef.current.cancel()
|
||||
}
|
||||
return next
|
||||
})
|
||||
}, [])
|
||||
|
||||
const handleTtsModeChange = useCallback((mode: 'summary' | 'full') => {
|
||||
setTtsMode(mode)
|
||||
ttsModeRef.current = mode
|
||||
}, [])
|
||||
|
||||
const handleCancelRecording = useCallback(() => {
|
||||
voice.cancel()
|
||||
setIsRecording(false)
|
||||
isRecordingRef.current = false
|
||||
}, [voice])
|
||||
|
||||
// Helper to cancel recording from any navigation handler
|
||||
const cancelRecordingIfActive = useCallback(() => {
|
||||
if (isRecordingRef.current) {
|
||||
voiceRef.current.cancel()
|
||||
setIsRecording(false)
|
||||
isRecordingRef.current = false
|
||||
}
|
||||
}, [])
|
||||
|
||||
// Runs history state
|
||||
type RunListItem = { id: string; title?: string; createdAt: string; agentId: string }
|
||||
const [runs, setRuns] = useState<RunListItem[]>([])
|
||||
|
|
@ -1496,6 +1579,9 @@ function App() {
|
|||
if (!isActiveRun) return
|
||||
setIsProcessing(true)
|
||||
setModelUsage(null)
|
||||
// Reset voice buffer for new response
|
||||
voiceTextBufferRef.current = ''
|
||||
spokenIndexRef.current = 0
|
||||
break
|
||||
|
||||
case 'run-processing-end':
|
||||
|
|
@ -1545,6 +1631,20 @@ function App() {
|
|||
if (llmEvent.type === 'text-delta' && llmEvent.delta) {
|
||||
appendStreamingBuffer(event.runId, llmEvent.delta)
|
||||
setCurrentAssistantMessage(prev => prev + llmEvent.delta)
|
||||
|
||||
// Extract completed <voice>…</voice> tags from the streamed text and send them to TTS when enabled.
// The full response is accumulated in voiceTextBufferRef; spokenIndexRef marks how far
// into that buffer tags have already been handled, so each tag is processed only once.
voiceTextBufferRef.current += llmEvent.delta
const scanStart = spokenIndexRef.current
const remaining = voiceTextBufferRef.current.substring(scanStart)
const voiceRegex = /<voice>([\s\S]*?)<\/voice>/g
let voiceMatch: RegExpExecArray | null
while ((voiceMatch = voiceRegex.exec(remaining)) !== null) {
  const voiceContent = voiceMatch[1].trim()
  console.log('[voice] extracted voice tag:', voiceContent)
  if (voiceContent && ttsEnabledRef.current) {
    ttsRef.current.speak(voiceContent)
  }
  // voiceMatch.index is relative to `remaining`, which always starts at scanStart.
  // Assign (rather than accumulate) so several tags completed by a single delta
  // do not push the cursor past the end of the buffer and hide later tags.
  spokenIndexRef.current = scanStart + voiceMatch.index + voiceMatch[0].length
}
|
||||
} else if (llmEvent.type === 'tool-call') {
|
||||
setConversation(prev => [...prev, {
|
||||
id: llmEvent.toolCallId || `tool-${Date.now()}`,
|
||||
|
|
@ -1584,6 +1684,7 @@ function App() {
|
|||
if (msg.role === 'assistant') {
|
||||
setCurrentAssistantMessage(currentMsg => {
|
||||
if (currentMsg) {
|
||||
const cleanedContent = currentMsg.replace(/<\/?voice>/g, '')
|
||||
setConversation(prev => {
|
||||
const exists = prev.some(m =>
|
||||
m.id === event.messageId && 'role' in m && m.role === 'assistant'
|
||||
|
|
@ -1592,7 +1693,7 @@ function App() {
|
|||
return [...prev, {
|
||||
id: event.messageId,
|
||||
role: 'assistant',
|
||||
content: currentMsg,
|
||||
content: cleanedContent,
|
||||
timestamp: Date.now(),
|
||||
}]
|
||||
})
|
||||
|
|
@ -1887,6 +1988,8 @@ function App() {
|
|||
await window.ipc.invoke('runs:createMessage', {
|
||||
runId: currentRunId,
|
||||
message: attachmentPayload,
|
||||
voiceInput: pendingVoiceInputRef.current || undefined,
|
||||
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
|
||||
})
|
||||
} else {
|
||||
// Legacy path: plain string with optional XML-formatted @mentions.
|
||||
|
|
@ -1915,11 +2018,15 @@ function App() {
|
|||
await window.ipc.invoke('runs:createMessage', {
|
||||
runId: currentRunId,
|
||||
message: formattedMessage,
|
||||
voiceInput: pendingVoiceInputRef.current || undefined,
|
||||
voiceOutput: ttsEnabledRef.current ? ttsModeRef.current : undefined,
|
||||
})
|
||||
|
||||
titleSource = formattedMessage
|
||||
}
|
||||
|
||||
pendingVoiceInputRef.current = false
|
||||
|
||||
if (isNewRun) {
|
||||
const inferredTitle = inferRunTitleFromMessage(titleSource)
|
||||
setRuns((prev) => {
|
||||
|
|
@ -1936,6 +2043,7 @@ function App() {
|
|||
console.error('Failed to send message:', error)
|
||||
}
|
||||
}
|
||||
handlePromptSubmitRef.current = handlePromptSubmit
|
||||
|
||||
const handleStop = useCallback(async () => {
|
||||
if (!runId) return
|
||||
|
|
@ -2065,6 +2173,7 @@ function App() {
|
|||
}, [])
|
||||
|
||||
const openChatInNewTab = useCallback((targetRunId: string) => {
|
||||
cancelRecordingIfActive()
|
||||
const existingTab = chatTabs.find(t => t.runId === targetRunId)
|
||||
if (existingTab) {
|
||||
// Cancel stale in-flight loads from previously focused tabs.
|
||||
|
|
@ -2080,12 +2189,18 @@ function App() {
|
|||
setChatTabs(prev => [...prev, { id, runId: targetRunId }])
|
||||
setActiveChatTabId(id)
|
||||
loadRun(targetRunId)
|
||||
}, [chatTabs, loadRun, restoreChatTabState])
|
||||
}, [chatTabs, loadRun, restoreChatTabState, cancelRecordingIfActive])
|
||||
|
||||
const switchChatTab = useCallback((tabId: string) => {
|
||||
const tab = chatTabs.find(t => t.id === tabId)
|
||||
if (!tab) return
|
||||
if (tabId === activeChatTabId) return
|
||||
// Cancel any active recording when switching tabs
|
||||
if (isRecordingRef.current) {
|
||||
voiceRef.current.cancel()
|
||||
setIsRecording(false)
|
||||
isRecordingRef.current = false
|
||||
}
|
||||
saveChatScrollForTab(activeChatTabId)
|
||||
// Cancel stale in-flight loads from previously focused tabs.
|
||||
loadRunRequestIdRef.current += 1
|
||||
|
|
@ -2471,13 +2586,14 @@ function App() {
|
|||
const current = currentViewState
|
||||
if (viewStatesEqual(current, nextView)) return
|
||||
|
||||
cancelRecordingIfActive()
|
||||
const nextHistory = {
|
||||
back: appendUnique(historyRef.current.back, current),
|
||||
forward: [] as ViewState[],
|
||||
}
|
||||
setHistory(nextHistory)
|
||||
await applyViewState(nextView)
|
||||
}, [appendUnique, applyViewState, currentViewState, setHistory])
|
||||
}, [appendUnique, applyViewState, cancelRecordingIfActive, currentViewState, setHistory])
|
||||
|
||||
const navigateBack = useCallback(async () => {
|
||||
const { back, forward } = historyRef.current
|
||||
|
|
@ -3412,6 +3528,7 @@ function App() {
|
|||
tasksActions={{
|
||||
onNewChat: handleNewChatTab,
|
||||
onSelectRun: (runIdToLoad) => {
|
||||
cancelRecordingIfActive()
|
||||
if (selectedPath || isGraphOpen) {
|
||||
setIsChatSidebarOpen(true)
|
||||
}
|
||||
|
|
@ -3814,7 +3931,7 @@ function App() {
|
|||
{tabState.currentAssistantMessage && (
|
||||
<Message from="assistant">
|
||||
<MessageContent>
|
||||
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage}</MessageResponse>
|
||||
<MessageResponse components={streamdownComponents}>{tabState.currentAssistantMessage.replace(/<\/?voice>/g, '')}</MessageResponse>
|
||||
</MessageContent>
|
||||
</Message>
|
||||
)}
|
||||
|
|
@ -3865,6 +3982,18 @@ function App() {
|
|||
runId={tabState.runId}
|
||||
initialDraft={chatDraftsRef.current.get(tab.id)}
|
||||
onDraftChange={(text) => setChatDraftForTab(tab.id, text)}
|
||||
isRecording={isActive && isRecording}
|
||||
recordingText={isActive ? voice.interimText : undefined}
|
||||
recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
|
||||
onStartRecording={isActive ? handleStartRecording : undefined}
|
||||
onSubmitRecording={isActive ? handleSubmitRecording : undefined}
|
||||
onCancelRecording={isActive ? handleCancelRecording : undefined}
|
||||
voiceAvailable={isActive && voiceAvailable}
|
||||
ttsAvailable={isActive && ttsAvailable}
|
||||
ttsEnabled={ttsEnabled}
|
||||
ttsMode={ttsMode}
|
||||
onToggleTts={isActive ? handleToggleTts : undefined}
|
||||
onTtsModeChange={isActive ? handleTtsModeChange : undefined}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
|
|
@ -3914,6 +4043,18 @@ function App() {
|
|||
onToolOpenChangeForTab={setToolOpenForTab}
|
||||
onOpenKnowledgeFile={(path) => { navigateToFile(path) }}
|
||||
onActivate={() => setActiveShortcutPane('right')}
|
||||
isRecording={isRecording}
|
||||
recordingText={voice.interimText}
|
||||
recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
|
||||
onStartRecording={handleStartRecording}
|
||||
onSubmitRecording={handleSubmitRecording}
|
||||
onCancelRecording={handleCancelRecording}
|
||||
voiceAvailable={voiceAvailable}
|
||||
ttsAvailable={ttsAvailable}
|
||||
ttsEnabled={ttsEnabled}
|
||||
ttsMode={ttsMode}
|
||||
onToggleTts={handleToggleTts}
|
||||
onTtsModeChange={handleTtsModeChange}
|
||||
/>
|
||||
)}
|
||||
{/* Rendered last so its no-drag region paints over the sidebar drag region */}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import { useCallback, useEffect, useRef, useState } from 'react'
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'
|
||||
import {
|
||||
ArrowUp,
|
||||
AudioLines,
|
||||
|
|
@ -9,7 +10,9 @@ import {
|
|||
FileSpreadsheet,
|
||||
FileText,
|
||||
FileVideo,
|
||||
Headphones,
|
||||
LoaderIcon,
|
||||
Mic,
|
||||
Plus,
|
||||
Square,
|
||||
X,
|
||||
|
|
@ -102,6 +105,18 @@ interface ChatInputInnerProps {
|
|||
runId?: string | null
|
||||
initialDraft?: string
|
||||
onDraftChange?: (text: string) => void
|
||||
isRecording?: boolean
|
||||
recordingText?: string
|
||||
recordingState?: 'connecting' | 'listening'
|
||||
onStartRecording?: () => void
|
||||
onSubmitRecording?: () => void
|
||||
onCancelRecording?: () => void
|
||||
voiceAvailable?: boolean
|
||||
ttsAvailable?: boolean
|
||||
ttsEnabled?: boolean
|
||||
ttsMode?: 'summary' | 'full'
|
||||
onToggleTts?: () => void
|
||||
onTtsModeChange?: (mode: 'summary' | 'full') => void
|
||||
}
|
||||
|
||||
function ChatInputInner({
|
||||
|
|
@ -115,6 +130,18 @@ function ChatInputInner({
|
|||
runId,
|
||||
initialDraft,
|
||||
onDraftChange,
|
||||
isRecording,
|
||||
recordingText,
|
||||
recordingState,
|
||||
onStartRecording,
|
||||
onSubmitRecording,
|
||||
onCancelRecording,
|
||||
voiceAvailable,
|
||||
ttsAvailable,
|
||||
ttsEnabled,
|
||||
ttsMode,
|
||||
onToggleTts,
|
||||
onTtsModeChange,
|
||||
}: ChatInputInnerProps) {
|
||||
const controller = usePromptInputController()
|
||||
const message = controller.textInput.value
|
||||
|
|
@ -367,6 +394,40 @@ function ChatInputInner({
|
|||
e.target.value = ''
|
||||
}}
|
||||
/>
|
||||
{isRecording ? (
|
||||
/* ── Recording bar ── */
|
||||
<div className="flex items-center gap-3 px-4 py-3">
|
||||
<button
|
||||
type="button"
|
||||
onClick={onCancelRecording}
|
||||
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
|
||||
aria-label="Cancel recording"
|
||||
>
|
||||
<X className="h-4 w-4" />
|
||||
</button>
|
||||
<div className="flex flex-1 items-center gap-2 overflow-hidden">
|
||||
<VoiceWaveform />
|
||||
<span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
|
||||
{recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
|
||||
</span>
|
||||
</div>
|
||||
<Button
|
||||
size="icon"
|
||||
onClick={onSubmitRecording}
|
||||
disabled={!recordingText?.trim()}
|
||||
className={cn(
|
||||
'h-7 w-7 shrink-0 rounded-full transition-all',
|
||||
recordingText?.trim()
|
||||
? 'bg-primary text-primary-foreground hover:bg-primary/90'
|
||||
: 'bg-muted text-muted-foreground'
|
||||
)}
|
||||
>
|
||||
<ArrowUp className="h-4 w-4" />
|
||||
</Button>
|
||||
</div>
|
||||
) : (
|
||||
/* ── Normal input ── */
|
||||
<>
|
||||
<div className="px-4 pt-4 pb-2">
|
||||
<PromptInputTextarea
|
||||
placeholder="Type your message..."
|
||||
|
|
@ -414,6 +475,63 @@ function ChatInputInner({
|
|||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
)}
|
||||
{onToggleTts && ttsAvailable && (
|
||||
<div className="flex shrink-0 items-center">
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<button
|
||||
type="button"
|
||||
onClick={onToggleTts}
|
||||
className={cn(
|
||||
'relative flex h-7 w-7 shrink-0 items-center justify-center rounded-full transition-colors',
|
||||
ttsEnabled
|
||||
? 'text-foreground hover:bg-muted'
|
||||
: 'text-muted-foreground hover:bg-muted hover:text-foreground'
|
||||
)}
|
||||
aria-label={ttsEnabled ? 'Disable voice output' : 'Enable voice output'}
|
||||
>
|
||||
<Headphones className="h-4 w-4" />
|
||||
{!ttsEnabled && (
|
||||
<span className="absolute inset-0 flex items-center justify-center pointer-events-none">
|
||||
<span className="block h-[1.5px] w-5 -rotate-45 rounded-full bg-muted-foreground" />
|
||||
</span>
|
||||
)}
|
||||
</button>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top">
|
||||
{ttsEnabled ? 'Voice output on' : 'Voice output off'}
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
{ttsEnabled && onTtsModeChange && (
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
<button
|
||||
type="button"
|
||||
className="flex h-7 w-4 shrink-0 items-center justify-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
>
|
||||
<ChevronDown className="h-3 w-3" />
|
||||
</button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="end">
|
||||
<DropdownMenuRadioGroup value={ttsMode ?? 'summary'} onValueChange={(v) => onTtsModeChange(v as 'summary' | 'full')}>
|
||||
<DropdownMenuRadioItem value="summary">Speak summary</DropdownMenuRadioItem>
|
||||
<DropdownMenuRadioItem value="full">Speak full response</DropdownMenuRadioItem>
|
||||
</DropdownMenuRadioGroup>
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
{voiceAvailable && onStartRecording && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={onStartRecording}
|
||||
className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
|
||||
aria-label="Voice input"
|
||||
>
|
||||
<Mic className="h-4 w-4" />
|
||||
</button>
|
||||
)}
|
||||
{isProcessing ? (
|
||||
<Button
|
||||
size="icon"
|
||||
|
|
@ -448,6 +566,31 @@ function ChatInputInner({
|
|||
</Button>
|
||||
)}
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Recording indicator: a row of five thin bars whose heights pulse between
 * 4px and 16px. Each bar starts its animation slightly later than the one
 * before it, which produces the moving-waveform effect.
 */
function VoiceWaveform() {
  const barCount = 5
  const staggerSeconds = 0.15

  // One <span> per bar; only the animation delay differs between bars.
  const bars = Array.from({ length: barCount }, (_, index) => {
    const animation = `voice-wave 1.2s ease-in-out ${index * staggerSeconds}s infinite`
    return (
      <span
        key={index}
        className="w-[3px] rounded-full bg-primary"
        style={{ animation }}
      />
    )
  })

  return (
    <div className="flex items-center gap-[3px] h-5">
      {bars}
      {/* Keyframes are declared inline so the component carries its own animation. */}
      <style>{`
        @keyframes voice-wave {
          0%, 100% { height: 4px; }
          50% { height: 16px; }
        }
      `}</style>
    </div>
  )
}
|
||||
|
|
@ -466,6 +609,18 @@ export interface ChatInputWithMentionsProps {
|
|||
runId?: string | null
|
||||
initialDraft?: string
|
||||
onDraftChange?: (text: string) => void
|
||||
isRecording?: boolean
|
||||
recordingText?: string
|
||||
recordingState?: 'connecting' | 'listening'
|
||||
onStartRecording?: () => void
|
||||
onSubmitRecording?: () => void
|
||||
onCancelRecording?: () => void
|
||||
voiceAvailable?: boolean
|
||||
ttsAvailable?: boolean
|
||||
ttsEnabled?: boolean
|
||||
ttsMode?: 'summary' | 'full'
|
||||
onToggleTts?: () => void
|
||||
onTtsModeChange?: (mode: 'summary' | 'full') => void
|
||||
}
|
||||
|
||||
export function ChatInputWithMentions({
|
||||
|
|
@ -482,6 +637,18 @@ export function ChatInputWithMentions({
|
|||
runId,
|
||||
initialDraft,
|
||||
onDraftChange,
|
||||
isRecording,
|
||||
recordingText,
|
||||
recordingState,
|
||||
onStartRecording,
|
||||
onSubmitRecording,
|
||||
onCancelRecording,
|
||||
voiceAvailable,
|
||||
ttsAvailable,
|
||||
ttsEnabled,
|
||||
ttsMode,
|
||||
onToggleTts,
|
||||
onTtsModeChange,
|
||||
}: ChatInputWithMentionsProps) {
|
||||
return (
|
||||
<PromptInputProvider knowledgeFiles={knowledgeFiles} recentFiles={recentFiles} visibleFiles={visibleFiles}>
|
||||
|
|
@ -496,6 +663,18 @@ export function ChatInputWithMentions({
|
|||
runId={runId}
|
||||
initialDraft={initialDraft}
|
||||
onDraftChange={onDraftChange}
|
||||
isRecording={isRecording}
|
||||
recordingText={recordingText}
|
||||
recordingState={recordingState}
|
||||
onStartRecording={onStartRecording}
|
||||
onSubmitRecording={onSubmitRecording}
|
||||
onCancelRecording={onCancelRecording}
|
||||
voiceAvailable={voiceAvailable}
|
||||
ttsAvailable={ttsAvailable}
|
||||
ttsEnabled={ttsEnabled}
|
||||
ttsMode={ttsMode}
|
||||
onToggleTts={onToggleTts}
|
||||
onTtsModeChange={onTtsModeChange}
|
||||
/>
|
||||
</PromptInputProvider>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -108,6 +108,19 @@ interface ChatSidebarProps {
|
|||
onToolOpenChangeForTab?: (tabId: string, toolId: string, open: boolean) => void
|
||||
onOpenKnowledgeFile?: (path: string) => void
|
||||
onActivate?: () => void
|
||||
// Voice / TTS props
|
||||
isRecording?: boolean
|
||||
recordingText?: string
|
||||
recordingState?: 'connecting' | 'listening'
|
||||
onStartRecording?: () => void
|
||||
onSubmitRecording?: () => void
|
||||
onCancelRecording?: () => void
|
||||
voiceAvailable?: boolean
|
||||
ttsAvailable?: boolean
|
||||
ttsEnabled?: boolean
|
||||
ttsMode?: 'summary' | 'full'
|
||||
onToggleTts?: () => void
|
||||
onTtsModeChange?: (mode: 'summary' | 'full') => void
|
||||
}
|
||||
|
||||
export function ChatSidebar({
|
||||
|
|
@ -146,6 +159,18 @@ export function ChatSidebar({
|
|||
onToolOpenChangeForTab,
|
||||
onOpenKnowledgeFile,
|
||||
onActivate,
|
||||
isRecording,
|
||||
recordingText,
|
||||
recordingState,
|
||||
onStartRecording,
|
||||
onSubmitRecording,
|
||||
onCancelRecording,
|
||||
voiceAvailable,
|
||||
ttsAvailable,
|
||||
ttsEnabled,
|
||||
ttsMode,
|
||||
onToggleTts,
|
||||
onTtsModeChange,
|
||||
}: ChatSidebarProps) {
|
||||
const [width, setWidth] = useState(() => getInitialPaneWidth(defaultWidth))
|
||||
const [isResizing, setIsResizing] = useState(false)
|
||||
|
|
@ -542,6 +567,18 @@ export function ChatSidebar({
|
|||
runId={tabState.runId}
|
||||
initialDraft={getInitialDraft?.(tab.id)}
|
||||
onDraftChange={onDraftChangeForTab ? (text) => onDraftChangeForTab(tab.id, text) : undefined}
|
||||
isRecording={isActive && isRecording}
|
||||
recordingText={isActive ? recordingText : undefined}
|
||||
recordingState={isActive ? recordingState : undefined}
|
||||
onStartRecording={isActive ? onStartRecording : undefined}
|
||||
onSubmitRecording={isActive ? onSubmitRecording : undefined}
|
||||
onCancelRecording={isActive ? onCancelRecording : undefined}
|
||||
voiceAvailable={isActive && voiceAvailable}
|
||||
ttsAvailable={isActive && ttsAvailable}
|
||||
ttsEnabled={ttsEnabled}
|
||||
ttsMode={ttsMode}
|
||||
onToggleTts={isActive ? onToggleTts : undefined}
|
||||
onTtsModeChange={isActive ? onTtsModeChange : undefined}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
|
|
|
|||
218
apps/x/apps/renderer/src/hooks/useVoiceMode.ts
Normal file
218
apps/x/apps/renderer/src/hooks/useVoiceMode.ts
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
import { useCallback, useEffect, useRef, useState } from 'react';
|
||||
|
||||
export type VoiceState = 'idle' | 'connecting' | 'listening';
|
||||
|
||||
// Cache the API key so we skip the IPC call after first use
|
||||
let cachedApiKey: string | null = null;
|
||||
let apiKeyFetched = false;
|
||||
|
||||
/** Deepgram live-transcription endpoint (16 kHz mono linear16 PCM, interim results on). */
const DEEPGRAM_LISTEN_URL =
  `wss://api.deepgram.com/v1/listen?model=nova-3&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&smart_format=true&punctuate=true&language=en`;
/** Interval between KeepAlive frames that stop the server closing an idle socket. */
const KEEP_ALIVE_INTERVAL_MS = 5000;
/** Delay before reconnecting after the socket closes. */
const RECONNECT_DELAY_MS = 3000;
/** How long `start()` waits for the socket to open before giving up. */
const WS_OPEN_TIMEOUT_MS = 5000;
/** Polling period while waiting for the socket to open. */
const WS_POLL_INTERVAL_MS = 50;

/**
 * Pull the first transcript alternative out of a raw WebSocket message.
 * Returns null for non-text frames, malformed JSON, non-transcript messages
 * and empty transcripts, so the caller never throws inside `onmessage`.
 */
function readTranscript(raw: unknown): { transcript: string; isFinal: boolean } | null {
  if (typeof raw !== 'string') return null;
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null) return null;
  const message = parsed as {
    is_final?: unknown;
    channel?: { alternatives?: Array<{ transcript?: unknown }> };
  };
  const transcript = message.channel?.alternatives?.[0]?.transcript;
  if (typeof transcript !== 'string' || !transcript) return null;
  return { transcript, isFinal: Boolean(message.is_final) };
}

/**
 * Speech-to-text hook backed by a Deepgram streaming WebSocket.
 *
 * The socket is opened on mount and kept alive while the hook is mounted;
 * microphone capture is started and stopped per recording.
 *
 * @returns
 *  - `state`: 'idle' | 'connecting' | 'listening'
 *  - `interimText`: finalized transcript plus the current interim segment
 *  - `start()`: begin a recording (no-op if one is already starting or running)
 *  - `submit()`: stop recording and return the transcript gathered so far
 *  - `cancel()`: stop recording and discard the transcript
 */
export function useVoiceMode() {
  const [state, setState] = useState<VoiceState>('idle');
  const [interimText, setInterimText] = useState('');
  const wsRef = useRef<WebSocket | null>(null);
  const mediaStreamRef = useRef<MediaStream | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const audioCtxRef = useRef<AudioContext | null>(null);
  const transcriptBufferRef = useRef('');
  const interimRef = useRef('');
  const reconnectTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  const keepAliveTimerRef = useRef<ReturnType<typeof setInterval> | null>(null);
  const mountedRef = useRef(true);
  // True from the moment start() is entered until the recording is stopped or fails.
  // Checked synchronously, so a double click cannot start two recordings.
  const activeRef = useRef(false);
  // Bumped whenever a recording starts or ends. An in-flight start() compares its
  // own value against this to notice it was cancelled while awaiting.
  const sessionRef = useRef(0);

  const clearKeepAlive = useCallback(() => {
    if (keepAliveTimerRef.current) {
      clearInterval(keepAliveTimerRef.current);
      keepAliveTimerRef.current = null;
    }
  }, []);

  const clearReconnectTimer = useCallback(() => {
    if (reconnectTimerRef.current) {
      clearTimeout(reconnectTimerRef.current);
      reconnectTimerRef.current = null;
    }
  }, []);

  // Release microphone, processor node and audio context without touching React state
  // (safe to call during unmount).
  const releaseAudio = useCallback(() => {
    if (processorRef.current) {
      processorRef.current.onaudioprocess = null;
      processorRef.current.disconnect();
      processorRef.current = null;
    }
    if (audioCtxRef.current) {
      // close() returns a promise; a rejection here only means it was already closed.
      void audioCtxRef.current.close().catch(() => undefined);
      audioCtxRef.current = null;
    }
    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach((track) => track.stop());
      mediaStreamRef.current = null;
    }
  }, []);

  // Connect (or reconnect) the Deepgram WebSocket.
  // The WS stays open while the hook is mounted; only audio capture starts/stops per recording.
  const connectWs = useCallback(() => {
    if (!cachedApiKey) return;
    const existing = wsRef.current;
    if (existing && (existing.readyState === WebSocket.OPEN || existing.readyState === WebSocket.CONNECTING)) return;

    const ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', cachedApiKey]);
    wsRef.current = ws;

    ws.onopen = () => {
      console.log('[voice] WebSocket connected');
      // Without periodic KeepAlive frames the server closes a socket that receives
      // no audio, which would turn the "persistent" socket into a reconnect loop.
      clearKeepAlive();
      keepAliveTimerRef.current = setInterval(() => {
        if (ws.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify({ type: 'KeepAlive' }));
        }
      }, KEEP_ALIVE_INTERVAL_MS);
    };

    ws.onmessage = (event) => {
      // Ignore late results that arrive after the recording was stopped.
      if (!activeRef.current) return;
      const result = readTranscript(event.data);
      if (!result) return;

      const separator = transcriptBufferRef.current ? ' ' : '';
      if (result.isFinal) {
        transcriptBufferRef.current += separator + result.transcript;
        interimRef.current = '';
        setInterimText(transcriptBufferRef.current);
      } else {
        interimRef.current = result.transcript;
        setInterimText(transcriptBufferRef.current + separator + result.transcript);
      }
    };

    ws.onerror = () => {
      console.error('[voice] WebSocket error');
    };

    ws.onclose = () => {
      console.log('[voice] WebSocket closed');
      if (wsRef.current === ws) wsRef.current = null;
      clearKeepAlive();
      // Auto-reconnect after 3 seconds if still mounted
      if (mountedRef.current && cachedApiKey) {
        clearReconnectTimer();
        reconnectTimerRef.current = setTimeout(() => {
          reconnectTimerRef.current = null;
          if (mountedRef.current) connectWs();
        }, RECONNECT_DELAY_MS);
      }
    };
  }, [clearKeepAlive, clearReconnectTimer]);

  // Fetch API key on mount and establish persistent WebSocket
  useEffect(() => {
    mountedRef.current = true;

    const init = async () => {
      if (!apiKeyFetched) {
        apiKeyFetched = true;
        try {
          const config = await window.ipc.invoke('voice:getConfig', null);
          cachedApiKey = config.deepgram?.apiKey ?? null;
        } catch (err) {
          // Best effort: start() retries the lookup when the user begins recording.
          console.error('[voice] failed to load voice config:', err);
        }
      }
      if (cachedApiKey && mountedRef.current) {
        connectWs();
      }
    };
    void init();

    return () => {
      mountedRef.current = false;
      // Invalidate any start() still awaiting and make sure the microphone is released.
      sessionRef.current += 1;
      activeRef.current = false;
      releaseAudio();
      clearReconnectTimer();
      clearKeepAlive();
      // Close WS on unmount, suppress reconnect by nulling onclose
      if (wsRef.current) {
        wsRef.current.onopen = null;
        wsRef.current.onmessage = null;
        wsRef.current.onclose = null;
        wsRef.current.close();
        wsRef.current = null;
      }
    };
  }, [connectWs, releaseAudio, clearReconnectTimer, clearKeepAlive]);

  // Stop only audio capture (mic + processor), leaving WS open
  const stopAudioCapture = useCallback(() => {
    sessionRef.current += 1;
    activeRef.current = false;
    releaseAudio();
    setInterimText('');
    transcriptBufferRef.current = '';
    interimRef.current = '';
    setState('idle');
  }, [releaseAudio]);

  const start = useCallback(async () => {
    if (activeRef.current) return;
    activeRef.current = true;
    sessionRef.current += 1;
    const session = sessionRef.current;

    // True once this start() was superseded by cancel/submit/unmount while awaiting.
    const isStale = () => sessionRef.current !== session || !mountedRef.current;
    // Return to idle after a failure, but only if this session is still the current one.
    const fail = () => {
      if (isStale()) return;
      activeRef.current = false;
      setState('idle');
    };

    // Ensure we have an API key
    if (!cachedApiKey) {
      try {
        const config = await window.ipc.invoke('voice:getConfig', null);
        cachedApiKey = config.deepgram?.apiKey ?? null;
      } catch (err) {
        console.error('[voice] failed to load voice config:', err);
      }
      if (isStale()) return;
    }
    if (!cachedApiKey) {
      console.error('Deepgram not configured');
      fail();
      return;
    }

    transcriptBufferRef.current = '';
    interimRef.current = '';
    setInterimText('');

    // If WS isn't connected, connect and wait for it
    if (wsRef.current?.readyState !== WebSocket.OPEN) {
      setState('connecting');
      connectWs();
      // Wait for WS to be ready (up to 5 seconds); both timers are cleared on every exit path.
      const wsOk = await new Promise<boolean>((resolve) => {
        const finish = (ok: boolean) => {
          clearInterval(poll);
          clearTimeout(deadline);
          resolve(ok);
        };
        const poll = setInterval(() => {
          if (isStale()) finish(false);
          else if (wsRef.current?.readyState === WebSocket.OPEN) finish(true);
        }, WS_POLL_INTERVAL_MS);
        const deadline = setTimeout(() => finish(false), WS_OPEN_TIMEOUT_MS);
      });
      if (isStale()) return;
      if (!wsOk) {
        fail();
        return;
      }
    }

    setState('listening');

    // Start mic
    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      console.error('Microphone access denied:', err);
      fail();
      return;
    }
    if (isStale()) {
      // Cancelled while the permission prompt / device open was pending:
      // do not leave the microphone running.
      stream.getTracks().forEach((track) => track.stop());
      return;
    }

    mediaStreamRef.current = stream;

    // Start audio capture
    const audioCtx = new AudioContext({ sampleRate: 16000 });
    audioCtxRef.current = audioCtx;
    const source = audioCtx.createMediaStreamSource(stream);
    const processor = audioCtx.createScriptProcessor(4096, 1, 1);
    processorRef.current = processor;

    processor.onaudioprocess = (e) => {
      const ws = wsRef.current;
      if (!ws || ws.readyState !== WebSocket.OPEN) return;
      // Convert Float32 samples in [-1, 1] to signed 16-bit PCM (linear16).
      const float32 = e.inputBuffer.getChannelData(0);
      const int16 = new Int16Array(float32.length);
      for (let i = 0; i < float32.length; i++) {
        const s = Math.max(-1, Math.min(1, float32[i]));
        int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      ws.send(int16.buffer);
    };

    source.connect(processor);
    processor.connect(audioCtx.destination);
  }, [connectWs]);

  /** Stop recording and return the full transcript (finalized + any current interim) */
  const submit = useCallback((): string => {
    let text = transcriptBufferRef.current;
    if (interimRef.current) {
      text += (text ? ' ' : '') + interimRef.current;
    }
    text = text.trim();
    stopAudioCapture();
    return text;
  }, [stopAudioCapture]);

  /** Cancel recording without returning transcript */
  const cancel = useCallback(() => {
    stopAudioCapture();
  }, [stopAudioCapture]);

  return { state, interimText, start, submit, cancel };
}
|
||||
72
apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
Normal file
72
apps/x/apps/renderer/src/hooks/useVoiceTTS.ts
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import { useCallback, useRef, useState } from 'react';
|
||||
|
||||
export type TTSState = 'idle' | 'synthesizing' | 'speaking';
|
||||
|
||||
/**
 * Text-to-speech hook: queues text snippets, synthesizes each one through the
 * `voice:synthesize` IPC channel and plays the returned audio, one after another.
 *
 * @returns
 *  - `state`: 'idle' | 'synthesizing' | 'speaking'
 *  - `speak(text)`: enqueue text for playback
 *  - `cancel()`: drop the queue and stop any synthesis or playback in progress
 */
export function useVoiceTTS() {
  const [state, setState] = useState<TTSState>('idle');
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const queueRef = useRef<string[]>([]);
  const processingRef = useRef(false);
  // Bumped by cancel(). A queue run remembers the value it started with and
  // stops as soon as it no longer matches, so a cancelled run can never play
  // audio or overwrite the state of a newer run.
  const generationRef = useRef(0);
  // Resolver of the playback promise currently being awaited, so cancel() can
  // release it (pausing an <audio> element does not fire `ended`).
  const settlePlaybackRef = useRef<(() => void) | null>(null);

  const processQueue = useCallback(async () => {
    if (processingRef.current) return;
    processingRef.current = true;
    const generation = generationRef.current;
    const isCancelled = () => generationRef.current !== generation;

    while (!isCancelled()) {
      const text = queueRef.current.shift();
      if (text === undefined) break;
      if (!text.trim()) continue;

      setState('synthesizing');
      console.log('[tts] synthesizing:', text.substring(0, 80));
      try {
        const result = await window.ipc.invoke('voice:synthesize', { text });
        // cancel() may have been called while synthesis was in flight.
        if (isCancelled()) break;
        console.log('[tts] got audio, mimeType:', result.mimeType, 'base64 length:', result.audioBase64.length);
        setState('speaking');

        await new Promise<void>((resolve, reject) => {
          const dataUrl = `data:${result.mimeType};base64,${result.audioBase64}`;
          const audio = new Audio(dataUrl);
          audioRef.current = audio;
          settlePlaybackRef.current = resolve;
          audio.onended = () => {
            console.log('[tts] audio ended');
            resolve();
          };
          audio.onerror = (e) => {
            console.error('[tts] audio error:', e);
            reject(new Error('Audio playback failed'));
          };
          audio.play().then(() => {
            console.log('[tts] audio playing');
          }).catch((err) => {
            console.error('[tts] play() rejected:', err);
            reject(err);
          });
        });
      } catch (err) {
        // One failed snippet should not stop the rest of the queue.
        console.error('[tts] error:', err);
      } finally {
        // Only the live run owns the shared refs; after cancel() they may
        // already belong to a newer run.
        if (!isCancelled()) settlePlaybackRef.current = null;
      }
    }

    // cancel() has already reset the shared state for a cancelled run.
    if (isCancelled()) return;

    audioRef.current = null;
    processingRef.current = false;
    setState('idle');
  }, []);

  const speak = useCallback((text: string) => {
    console.log('[tts] speak() called:', text.substring(0, 80));
    queueRef.current.push(text);
    void processQueue();
  }, [processQueue]);

  const cancel = useCallback(() => {
    // Invalidate the running queue loop first so it exits at its next check.
    generationRef.current += 1;
    queueRef.current = [];

    const audio = audioRef.current;
    if (audio) {
      audio.onended = null;
      audio.onerror = null;
      audio.pause();
      audioRef.current = null;
    }

    // Unblock a loop that is awaiting playback so it can finish instead of hanging.
    settlePlaybackRef.current?.();
    settlePlaybackRef.current = null;

    processingRef.current = false;
    setState('idle');
  }, []);

  return { state, speak, cancel };
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue