From ca9d5761d309eecb3083cb62304206388e788095 Mon Sep 17 00:00:00 2001 From: Arjun <6592213+arkml@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:18:23 +0530 Subject: [PATCH] meeting transcription first commit --- apps/x/apps/main/entitlements.plist | 10 + apps/x/apps/main/forge.config.cjs | 4 + apps/x/apps/main/src/main.ts | 18 +- apps/x/apps/renderer/src/App.tsx | 44 ++- .../src/hooks/useMeetingTranscription.ts | 269 ++++++++++++++++++ 5 files changed, 341 insertions(+), 4 deletions(-) create mode 100644 apps/x/apps/main/entitlements.plist create mode 100644 apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts diff --git a/apps/x/apps/main/entitlements.plist b/apps/x/apps/main/entitlements.plist new file mode 100644 index 00000000..db2dbd7e --- /dev/null +++ b/apps/x/apps/main/entitlements.plist @@ -0,0 +1,10 @@ + + + + + com.apple.security.device.audio-input + + com.apple.security.device.screen-capture + + + diff --git a/apps/x/apps/main/forge.config.cjs b/apps/x/apps/main/forge.config.cjs index 57f733f2..c79a8c43 100644 --- a/apps/x/apps/main/forge.config.cjs +++ b/apps/x/apps/main/forge.config.cjs @@ -13,6 +13,10 @@ module.exports = { appCategoryType: 'public.app-category.productivity', osxSign: { batchCodesignCalls: true, + optionsForFile: () => ({ + entitlements: path.join(__dirname, 'entitlements.plist'), + 'entitlements-inherit': path.join(__dirname, 'entitlements.plist'), + }), }, osxNotarize: { appleId: process.env.APPLE_ID, diff --git a/apps/x/apps/main/src/main.ts b/apps/x/apps/main/src/main.ts index 579fdbfa..060f0433 100644 --- a/apps/x/apps/main/src/main.ts +++ b/apps/x/apps/main/src/main.ts @@ -1,4 +1,4 @@ -import { app, BrowserWindow, protocol, net, shell, session } from "electron"; +import { app, BrowserWindow, desktopCapturer, protocol, net, shell, session } from "electron"; import path from "node:path"; import { setupIpcHandlers, @@ -92,15 +92,27 @@ function createWindow() { }, }); - // Grant microphone permission for voice mode + // 
Grant microphone and display-capture permissions session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => { - if (permission === 'media') { + if (permission === 'media' || permission === 'display-capture') { callback(true); } else { callback(false); } }); + // Auto-approve display media requests and route system audio as loopback. + // Electron requires a video source in the callback even if we only want audio. + // We pass the first available screen source; the renderer discards the video track. + session.defaultSession.setDisplayMediaRequestHandler(async (_request, callback) => { + const sources = await desktopCapturer.getSources({ types: ['screen'] }); + if (sources.length === 0) { + callback({}); + return; + } + callback({ video: sources[0], audio: 'loopback' }); + }); + // Show window when content is ready to prevent blank screen win.once("ready-to-show", () => { win.maximize(); diff --git a/apps/x/apps/renderer/src/App.tsx b/apps/x/apps/renderer/src/App.tsx index a92f2d28..fdea8bac 100644 --- a/apps/x/apps/renderer/src/App.tsx +++ b/apps/x/apps/renderer/src/App.tsx @@ -5,7 +5,7 @@ import { RunEvent, ListRunsResponse } from '@x/shared/src/runs.js'; import type { LanguageModelUsage, ToolUIPart } from 'ai'; import './App.css' import z from 'zod'; -import { CheckIcon, LoaderIcon, PanelLeftIcon, Maximize2, Minimize2, ChevronLeftIcon, ChevronRightIcon, SquarePen, SearchIcon, HistoryIcon } from 'lucide-react'; +import { CheckIcon, LoaderIcon, PanelLeftIcon, Maximize2, Minimize2, ChevronLeftIcon, ChevronRightIcon, SquarePen, SearchIcon, HistoryIcon, RadioIcon, SquareIcon } from 'lucide-react'; import { cn } from '@/lib/utils'; import { MarkdownEditor } from './components/markdown-editor'; import { ChatSidebar } from './components/chat-sidebar'; @@ -78,6 +78,7 @@ import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js' import { toast } from "sonner" import { useVoiceMode } from '@/hooks/useVoiceMode' import { 
useVoiceTTS } from '@/hooks/useVoiceTTS' +import { useMeetingTranscription, type MeetingTranscriptionState } from '@/hooks/useMeetingTranscription' type DirEntry = z.infer type RunEventType = z.infer @@ -383,6 +384,8 @@ function FixedSidebarToggle({ canNavigateForward, onNewChat, onOpenSearch, + meetingState, + onToggleMeeting, leftInsetPx, }: { onNavigateBack: () => void @@ -391,6 +394,8 @@ function FixedSidebarToggle({ canNavigateForward: boolean onNewChat: () => void onOpenSearch: () => void + meetingState: MeetingTranscriptionState + onToggleMeeting: () => void leftInsetPx: number }) { const { toggleSidebar, state } = useSidebar() @@ -426,6 +431,25 @@ function FixedSidebarToggle({ > + {/* Back / Forward navigation */} {isCollapsed && ( <> @@ -619,6 +643,11 @@ function App() { const voiceRef = useRef(voice) voiceRef.current = voice + const handleToggleMeetingRef = useRef<(() => void) | undefined>(undefined) + const meetingTranscription = useMeetingTranscription(() => { + handleToggleMeetingRef.current?.() + }) + // Check if voice is available on mount and when OAuth state changes const refreshVoiceAvailability = useCallback(() => { Promise.all([ @@ -3314,6 +3343,17 @@ function App() { navigateToFile(notePath) }, [loadDirectory, navigateToFile, fileTabs]) + const handleToggleMeeting = useCallback(async () => { + if (meetingTranscription.state === 'recording') { + await meetingTranscription.stop() + } else if (meetingTranscription.state === 'idle') { + const notePath = await meetingTranscription.start() + if (notePath) { + await handleVoiceNoteCreated(notePath) + } + } + }, [meetingTranscription, handleVoiceNoteCreated]) + const ensureWikiFile = useCallback(async (wikiPath: string) => { const resolvedPath = toKnowledgePath(wikiPath) if (!resolvedPath) return null @@ -4175,6 +4215,8 @@ function App() { canNavigateForward={canNavigateForward} onNewChat={handleNewChatTab} onOpenSearch={() => setIsSearchOpen(true)} + meetingState={meetingTranscription.state} + 
onToggleMeeting={() => { void handleToggleMeeting() }} leftInsetPx={isMac ? MACOS_TRAFFIC_LIGHTS_RESERVED_PX : 0} />
diff --git a/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts b/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts
new file mode 100644
index 00000000..103cfe74
--- /dev/null
+++ b/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts
@@ -0,0 +1,347 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+
+/** Lifecycle of a meeting-transcription session as exposed to the UI. */
+export type MeetingTranscriptionState = 'idle' | 'connecting' | 'recording' | 'stopping';
+
+// Query parameters for the Deepgram streaming endpoint. `channels: 2` plus
+// `multichannel` must stay in sync with the interleaved stereo PCM produced in
+// `onaudioprocess` below (channel 0 = microphone, channel 1 = system audio).
+const DEEPGRAM_PARAMS = new URLSearchParams({
+  model: 'nova-3',
+  encoding: 'linear16',
+  sample_rate: '16000',
+  channels: '2',
+  multichannel: 'true',
+  interim_results: 'true',
+  smart_format: 'true',
+  punctuate: 'true',
+});
+const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
+
+/** How long to wait for the WebSocket handshake before giving up. */
+const WS_CONNECT_TIMEOUT_MS = 5000;
+/** Quiet period after the last final transcript before the note is rewritten. */
+const WRITE_DEBOUNCE_MS = 5000;
+
+interface TranscriptEntry {
+  speaker: string;
+  text: string;
+}
+
+/** The subset of a Deepgram streaming message that this hook reads. */
+interface DeepgramMessage {
+  channel?: { alternatives?: Array<{ transcript?: string }> };
+  channel_index?: number[];
+  is_final?: boolean;
+}
+
+/**
+ * Parses a raw WebSocket payload. Returns null for anything that is not a JSON
+ * object (binary frames, malformed text) so that one bad frame cannot throw
+ * inside the socket's message handler.
+ */
+function parseDeepgramMessage(raw: unknown): DeepgramMessage | null {
+  if (typeof raw !== 'string') return null;
+  try {
+    const parsed: unknown = JSON.parse(raw);
+    // Field types are not validated here; the caller reads them defensively.
+    return typeof parsed === 'object' && parsed !== null ? (parsed as DeepgramMessage) : null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Renders the meeting note: YAML front matter, a heading, then one
+ * `**Speaker:** text` paragraph per transcript entry.
+ *
+ * @param entries Transcript turns in the order they should appear; may be empty.
+ * @param date Timestamp string stored verbatim in the front matter.
+ * @returns The complete Markdown document.
+ */
+function formatTranscript(entries: TranscriptEntry[], date: string): string {
+  const lines = [
+    '---',
+    'type: meeting',
+    'source: rowboat',
+    'title: Meeting Transcription',
+    `date: "${date}"`,
+    '---',
+    '',
+    '# Meeting Transcription',
+    '',
+  ];
+  for (const entry of entries) {
+    lines.push(`**${entry.speaker}:** ${entry.text}`, '');
+  }
+  return lines.join('\n');
+}
+
+/**
+ * Records the microphone and system audio as a two-channel stream, sends it to
+ * Deepgram for live transcription, and mirrors the transcript into a Markdown
+ * note under `knowledge/Meetings/rowboat/`.
+ *
+ * @param onAutoStop Optional callback invoked when the Deepgram socket closes
+ *   on its own while recording, so the owner can end the session.
+ *   NOTE(review): App.tsx passes a callback here, but the event it is meant to
+ *   fire on is not visible from this file - confirm this trigger.
+ * @returns `state` for the UI, `start()` resolving to the new note's path (or
+ *   null when the session could not be started), and `stop()`.
+ */
+export function useMeetingTranscription(onAutoStop?: () => void) {
+  const [state, setState] = useState<MeetingTranscriptionState>('idle');
+  // Synchronous mirror of `state`: the guards in start()/stop() must see the
+  // latest value even when called twice before React re-renders.
+  const stateRef = useRef<MeetingTranscriptionState>('idle');
+  const onAutoStopRef = useRef(onAutoStop);
+  onAutoStopRef.current = onAutoStop;
+  const wsRef = useRef<WebSocket | null>(null);
+  const micStreamRef = useRef<MediaStream | null>(null);
+  const systemStreamRef = useRef<MediaStream | null>(null);
+  const processorRef = useRef<ScriptProcessorNode | null>(null);
+  const audioCtxRef = useRef<AudioContext | null>(null);
+  const transcriptRef = useRef<TranscriptEntry[]>([]);
+  const notePathRef = useRef('');
+  const writeTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  const dateRef = useRef('');
+
+  const updateState = useCallback((next: MeetingTranscriptionState) => {
+    stateRef.current = next;
+    setState(next);
+  }, []);
+
+  /** Overwrites the note with the transcript so far; failures are only logged. */
+  const writeTranscriptToFile = useCallback(async () => {
+    if (!notePathRef.current || transcriptRef.current.length === 0) return;
+    const content = formatTranscript(transcriptRef.current, dateRef.current);
+    try {
+      await window.ipc.invoke('workspace:writeFile', {
+        path: notePathRef.current,
+        data: content,
+        opts: { encoding: 'utf8' },
+      });
+    } catch (err) {
+      console.error('[meeting] Failed to write transcript:', err);
+    }
+  }, []);
+
+  /** (Re)starts the timer that flushes the transcript after a quiet period. */
+  const scheduleDebouncedWrite = useCallback(() => {
+    if (writeTimerRef.current) clearTimeout(writeTimerRef.current);
+    writeTimerRef.current = setTimeout(() => {
+      void writeTranscriptToFile();
+    }, WRITE_DEBOUNCE_MS);
+  }, [writeTranscriptToFile]);
+
+  /** Releases the timer, audio graph, media tracks and socket; safe to call repeatedly. */
+  const cleanup = useCallback(() => {
+    if (writeTimerRef.current) {
+      clearTimeout(writeTimerRef.current);
+      writeTimerRef.current = null;
+    }
+    if (processorRef.current) {
+      processorRef.current.disconnect();
+      processorRef.current = null;
+    }
+    if (audioCtxRef.current) {
+      // close() returns a promise; handle a rejection instead of leaving it floating.
+      void audioCtxRef.current.close().catch((err: unknown) => {
+        console.warn('[meeting] Failed to close AudioContext:', err);
+      });
+      audioCtxRef.current = null;
+    }
+    if (micStreamRef.current) {
+      micStreamRef.current.getTracks().forEach((t) => t.stop());
+      micStreamRef.current = null;
+    }
+    if (systemStreamRef.current) {
+      systemStreamRef.current.getTracks().forEach((t) => t.stop());
+      systemStreamRef.current = null;
+    }
+    if (wsRef.current) {
+      // Detach first so a deliberate close is not treated as an unexpected one.
+      wsRef.current.onclose = null;
+      wsRef.current.close();
+      wsRef.current = null;
+    }
+  }, []);
+
+  // Release the devices and the socket if the owning component unmounts mid-session.
+  useEffect(() => cleanup, [cleanup]);
+
+  const start = useCallback(async (): Promise<string | null> => {
+    if (stateRef.current !== 'idle') return null;
+    updateState('connecting');
+
+    // Shared failure path: log, release whatever was acquired, return to idle.
+    const abort = (...logArgs: unknown[]): null => {
+      console.error(...logArgs);
+      cleanup();
+      updateState('idle');
+      return null;
+    };
+
+    // Authenticate with a proxy token when available, else the configured API key.
+    let ws: WebSocket;
+    try {
+      const result = await window.ipc.invoke('voice:getDeepgramToken', null);
+      if (result) {
+        console.log('[meeting] Using proxy token');
+        ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['bearer', result.token]);
+      } else {
+        const config = await window.ipc.invoke('voice:getConfig', null);
+        if (!config?.deepgram) {
+          return abort('[meeting] No Deepgram config available');
+        }
+        console.log('[meeting] Using API key');
+        ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]);
+      }
+    } catch (err) {
+      return abort('[meeting] Failed to get Deepgram token:', err);
+    }
+    wsRef.current = ws;
+
+    // Wait for the handshake; the timer is cleared so it cannot fire after settling.
+    const wsOk = await new Promise<boolean>((resolve) => {
+      const timer = setTimeout(() => resolve(false), WS_CONNECT_TIMEOUT_MS);
+      const settle = (ok: boolean) => {
+        clearTimeout(timer);
+        resolve(ok);
+      };
+      ws.onopen = () => settle(true);
+      ws.onerror = () => settle(false);
+    });
+    if (!wsOk) return abort('[meeting] WebSocket failed to connect');
+    console.log('[meeting] WebSocket connected');
+    ws.onerror = (event) => console.error('[meeting] WebSocket error:', event);
+
+    transcriptRef.current = [];
+    ws.onmessage = (event: MessageEvent) => {
+      const data = parseDeepgramMessage(event.data);
+      const transcript = data?.channel?.alternatives?.[0]?.transcript;
+      // Interim results are skipped; only finalized, non-empty text reaches the note.
+      if (!data || typeof transcript !== 'string' || !transcript || !data.is_final) return;
+
+      // Channel 0 carries the microphone, any other channel the system audio.
+      const channelIndex = data.channel_index?.[0] ?? 0;
+      const speaker = channelIndex === 0 ? 'You' : 'Speaker';
+
+      // Merge consecutive results from the same speaker into one paragraph.
+      const entries = transcriptRef.current;
+      const last = entries[entries.length - 1];
+      if (last && last.speaker === speaker) {
+        last.text += ' ' + transcript;
+      } else {
+        entries.push({ speaker, text: transcript });
+      }
+      scheduleDebouncedWrite();
+    };
+
+    ws.onclose = () => {
+      console.log('[meeting] WebSocket closed');
+      wsRef.current = null;
+      // cleanup() detaches this handler, so reaching here means nobody asked for the close.
+      if (stateRef.current === 'recording') onAutoStopRef.current?.();
+    };
+
+    let micStream: MediaStream;
+    try {
+      micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    } catch (err) {
+      return abort('[meeting] Microphone access denied:', err);
+    }
+    micStreamRef.current = micStream;
+
+    // System audio is requested through getDisplayMedia; the main-process
+    // setDisplayMediaRequestHandler auto-approves it with loopback audio.
+    let systemStream: MediaStream;
+    try {
+      systemStream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true });
+    } catch (err) {
+      return abort('[meeting] System audio access denied:', err);
+    }
+    // Track the stream immediately so abort() releases it on any later failure.
+    systemStreamRef.current = systemStream;
+    // Only the audio is needed; stop the video track right away.
+    systemStream.getVideoTracks().forEach((t) => t.stop());
+    if (systemStream.getAudioTracks().length === 0) {
+      return abort('[meeting] No audio track from getDisplayMedia');
+    }
+    console.log('[meeting] System audio captured');
+
+    // The socket may have dropped while the capture requests were pending; do
+    // not enter `recording` with nowhere to send audio.
+    if (ws.readyState !== WebSocket.OPEN) {
+      return abort('[meeting] WebSocket closed before recording started');
+    }
+
+    // Audio graph: both sources -> 2-channel merger -> script processor.
+    const audioCtx = new AudioContext({ sampleRate: 16000 });
+    audioCtxRef.current = audioCtx;
+
+    const micSource = audioCtx.createMediaStreamSource(micStream);
+    const systemSource = audioCtx.createMediaStreamSource(systemStream);
+    const merger = audioCtx.createChannelMerger(2);
+    micSource.connect(merger, 0, 0); // mic -> channel 0
+    systemSource.connect(merger, 0, 1); // system audio -> channel 1
+
+    const processor = audioCtx.createScriptProcessor(4096, 2, 2);
+    processorRef.current = processor;
+    processor.onaudioprocess = (e) => {
+      const socket = wsRef.current;
+      if (!socket || socket.readyState !== WebSocket.OPEN) return;
+      const ch0 = e.inputBuffer.getChannelData(0);
+      const ch1 = e.inputBuffer.getChannelData(1);
+      // Interleave both channels as 16-bit PCM: [mic0, sys0, mic1, sys1, ...].
+      const int16 = new Int16Array(ch0.length * 2);
+      for (let i = 0; i < ch0.length; i++) {
+        const s0 = Math.max(-1, Math.min(1, ch0[i]));
+        const s1 = Math.max(-1, Math.min(1, ch1[i]));
+        int16[i * 2] = s0 < 0 ? s0 * 0x8000 : s0 * 0x7fff;
+        int16[i * 2 + 1] = s1 < 0 ? s1 * 0x8000 : s1 * 0x7fff;
+      }
+      socket.send(int16.buffer);
+    };
+
+    merger.connect(processor);
+    processor.connect(audioCtx.destination);
+
+    // Create the note file up front so the caller can open it immediately.
+    const dateStr = new Date().toISOString();
+    dateRef.current = dateStr;
+    // File-name-safe timestamp: colons replaced, fractional seconds and `Z` dropped.
+    const timestamp = dateStr.replace(/:/g, '-').replace(/\.\d+Z$/, '');
+    const notePath = `knowledge/Meetings/rowboat/meeting-${timestamp}.md`;
+    notePathRef.current = notePath;
+
+    try {
+      await window.ipc.invoke('workspace:writeFile', {
+        path: notePath,
+        data: formatTranscript([], dateStr),
+        opts: { encoding: 'utf8', mkdirp: true },
+      });
+    } catch (err) {
+      // Without this, a failed write left the state stuck on `connecting`
+      // with the microphone and system capture still running.
+      notePathRef.current = '';
+      return abort('[meeting] Failed to create meeting note:', err);
+    }
+
+    updateState('recording');
+    return notePath;
+  }, [cleanup, scheduleDebouncedWrite, updateState]);
+
+  const stop = useCallback(async () => {
+    if (stateRef.current !== 'recording') return;
+    updateState('stopping');
+
+    cleanup();
+    // cleanup() cancelled any pending debounced write, so flush the final transcript now.
+    await writeTranscriptToFile();
+
+    updateState('idle');
+  }, [cleanup, updateState, writeTranscriptToFile]);
+
+  return { state, start, stop };
+}