meeting transcription first commit

Arjun 2026-03-17 10:18:23 +05:30
parent 128f433e5c
commit ca9d5761d3
5 changed files with 341 additions and 4 deletions

View file

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.device.audio-input</key>
<true/>
<key>com.apple.security.device.screen-capture</key>
<true/>
</dict>
</plist>

View file

@@ -13,6 +13,10 @@ module.exports = {
appCategoryType: 'public.app-category.productivity',
osxSign: {
batchCodesignCalls: true,
optionsForFile: () => ({
entitlements: path.join(__dirname, 'entitlements.plist'),
'entitlements-inherit': path.join(__dirname, 'entitlements.plist'),
}),
},
osxNotarize: {
appleId: process.env.APPLE_ID,

View file

@@ -1,4 +1,4 @@
import { app, BrowserWindow, protocol, net, shell, session } from "electron";
import { app, BrowserWindow, desktopCapturer, protocol, net, shell, session } from "electron";
import path from "node:path";
import {
setupIpcHandlers,
@@ -92,15 +92,27 @@ function createWindow() {
},
});
// Grant microphone permission for voice mode
// Grant microphone and display-capture permissions
session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback) => {
if (permission === 'media') {
if (permission === 'media' || permission === 'display-capture') {
callback(true);
} else {
callback(false);
}
});
// Auto-approve display media requests and route system audio as loopback.
// Electron requires a video source in the callback even if we only want audio.
// We pass the first available screen source; the renderer discards the video track.
session.defaultSession.setDisplayMediaRequestHandler(async (_request, callback) => {
const sources = await desktopCapturer.getSources({ types: ['screen'] });
if (sources.length === 0) {
callback({});
return;
}
callback({ video: sources[0], audio: 'loopback' });
});
// Show window when content is ready to prevent blank screen
win.once("ready-to-show", () => {
win.maximize();
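
For orientation, the renderer-side counterpart of this handler looks roughly like the sketch below; the hypothetical captureSystemAudio helper is not part of this commit, and the new useMeetingTranscription hook later in the diff does the equivalent with full error handling. A plain getDisplayMedia call is auto-approved by the handler above, the mandatory video track is discarded, and the remaining audio track carries loopback system audio.

// Sketch only; assumes the setDisplayMediaRequestHandler above is installed in the main process.
async function captureSystemAudio(): Promise<MediaStreamTrack | undefined> {
  const stream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true });
  // Electron required a video source in the approval; only the audio track is needed here.
  stream.getVideoTracks().forEach((t) => t.stop());
  return stream.getAudioTracks()[0];
}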

View file

@@ -5,7 +5,7 @@ import { RunEvent, ListRunsResponse } from '@x/shared/src/runs.js';
import type { LanguageModelUsage, ToolUIPart } from 'ai';
import './App.css'
import z from 'zod';
import { CheckIcon, LoaderIcon, PanelLeftIcon, Maximize2, Minimize2, ChevronLeftIcon, ChevronRightIcon, SquarePen, SearchIcon, HistoryIcon } from 'lucide-react';
import { CheckIcon, LoaderIcon, PanelLeftIcon, Maximize2, Minimize2, ChevronLeftIcon, ChevronRightIcon, SquarePen, SearchIcon, HistoryIcon, RadioIcon, SquareIcon } from 'lucide-react';
import { cn } from '@/lib/utils';
import { MarkdownEditor } from './components/markdown-editor';
import { ChatSidebar } from './components/chat-sidebar';
@@ -78,6 +78,7 @@ import { AgentScheduleState } from '@x/shared/dist/agent-schedule-state.js'
import { toast } from "sonner"
import { useVoiceMode } from '@/hooks/useVoiceMode'
import { useVoiceTTS } from '@/hooks/useVoiceTTS'
import { useMeetingTranscription, type MeetingTranscriptionState } from '@/hooks/useMeetingTranscription'
type DirEntry = z.infer<typeof workspace.DirEntry>
type RunEventType = z.infer<typeof RunEvent>
@@ -383,6 +384,8 @@ function FixedSidebarToggle({
canNavigateForward,
onNewChat,
onOpenSearch,
meetingState,
onToggleMeeting,
leftInsetPx,
}: {
onNavigateBack: () => void
@@ -391,6 +394,8 @@ function FixedSidebarToggle({
canNavigateForward: boolean
onNewChat: () => void
onOpenSearch: () => void
meetingState: MeetingTranscriptionState
onToggleMeeting: () => void
leftInsetPx: number
}) {
const { toggleSidebar, state } = useSidebar()
@@ -426,6 +431,25 @@ function FixedSidebarToggle({
>
<SearchIcon className="size-5" />
</button>
<button
type="button"
onClick={onToggleMeeting}
disabled={meetingState === 'connecting' || meetingState === 'stopping'}
className={cn(
"flex h-8 w-8 items-center justify-center rounded-md transition-colors disabled:opacity-50 disabled:pointer-events-none",
meetingState === 'recording'
? "text-red-500 hover:bg-accent"
: "text-muted-foreground hover:bg-accent hover:text-foreground"
)}
style={{ marginLeft: TITLEBAR_BUTTON_GAP_PX }}
aria-label={meetingState === 'recording' ? "Stop meeting transcription" : "Start meeting transcription"}
>
{meetingState === 'recording' ? (
<SquareIcon className="size-4 animate-pulse" />
) : (
<RadioIcon className="size-5" />
)}
</button>
{/* Back / Forward navigation */}
{isCollapsed && (
<>
@@ -619,6 +643,11 @@ function App() {
const voiceRef = useRef(voice)
voiceRef.current = voice
const handleToggleMeetingRef = useRef<(() => void) | undefined>(undefined)
const meetingTranscription = useMeetingTranscription(() => {
handleToggleMeetingRef.current?.()
})
// Check if voice is available on mount and when OAuth state changes
const refreshVoiceAvailability = useCallback(() => {
Promise.all([
@@ -3314,6 +3343,17 @@ function App() {
navigateToFile(notePath)
}, [loadDirectory, navigateToFile, fileTabs])
const handleToggleMeeting = useCallback(async () => {
if (meetingTranscription.state === 'recording') {
await meetingTranscription.stop()
} else if (meetingTranscription.state === 'idle') {
const notePath = await meetingTranscription.start()
if (notePath) {
await handleVoiceNoteCreated(notePath)
}
}
}, [meetingTranscription, handleVoiceNoteCreated])
const ensureWikiFile = useCallback(async (wikiPath: string) => {
const resolvedPath = toKnowledgePath(wikiPath)
if (!resolvedPath) return null
@@ -4175,6 +4215,8 @@ function App() {
canNavigateForward={canNavigateForward}
onNewChat={handleNewChatTab}
onOpenSearch={() => setIsSearchOpen(true)}
meetingState={meetingTranscription.state}
onToggleMeeting={() => { void handleToggleMeeting() }}
leftInsetPx={isMac ? MACOS_TRAFFIC_LIGHTS_RESERVED_PX : 0}
/>
</SidebarProvider>

View file

@@ -0,0 +1,269 @@
import { useCallback, useRef, useState } from 'react';
export type MeetingTranscriptionState = 'idle' | 'connecting' | 'recording' | 'stopping';
const DEEPGRAM_PARAMS = new URLSearchParams({
model: 'nova-3',
encoding: 'linear16',
sample_rate: '16000',
channels: '2',
multichannel: 'true',
interim_results: 'true',
smart_format: 'true',
punctuate: 'true',
});
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
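// For reference, a sketch of the streaming result fields this hook reads below (assumed
// shape, not Deepgram's full response schema). With multichannel=true, channel_index[0]
// identifies the source channel: 0 is the microphone and 1 is system audio, matching the
// channel-merger wiring set up in start(). The DeepgramResultSketch name is illustrative
// only and is not used elsewhere.
interface DeepgramResultSketch {
  is_final?: boolean;
  channel_index?: number[]; // e.g. [0, 2] means channel 0 of 2
  channel?: {
    alternatives?: { transcript: string }[];
  };
}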
interface TranscriptEntry {
speaker: string;
text: string;
}
function formatTranscript(entries: TranscriptEntry[], date: string): string {
const lines = [
'---',
'type: meeting',
'source: rowboat',
'title: Meeting Transcription',
`date: "${date}"`,
'---',
'',
'# Meeting Transcription',
'',
];
for (const entry of entries) {
lines.push(`**${entry.speaker}:** ${entry.text}`);
lines.push('');
}
return lines.join('\n');
}
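// For reference, formatTranscript with two hypothetical entries renders like this
// (the date value is made up):
//
// ---
// type: meeting
// source: rowboat
// title: Meeting Transcription
// date: "2026-03-17T04:48:23.000Z"
// ---
//
// # Meeting Transcription
//
// **You:** Hello.
//
// **Speaker:** Hi there.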
export function useMeetingTranscription() {
const [state, setState] = useState<MeetingTranscriptionState>('idle');
const wsRef = useRef<WebSocket | null>(null);
const micStreamRef = useRef<MediaStream | null>(null);
const systemStreamRef = useRef<MediaStream | null>(null);
const processorRef = useRef<ScriptProcessorNode | null>(null);
const audioCtxRef = useRef<AudioContext | null>(null);
const transcriptRef = useRef<TranscriptEntry[]>([]);
const notePathRef = useRef<string>('');
const writeTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
const dateRef = useRef<string>('');
const writeTranscriptToFile = useCallback(async () => {
if (!notePathRef.current || transcriptRef.current.length === 0) return;
const content = formatTranscript(transcriptRef.current, dateRef.current);
try {
await window.ipc.invoke('workspace:writeFile', {
path: notePathRef.current,
data: content,
opts: { encoding: 'utf8' },
});
} catch (err) {
console.error('[meeting] Failed to write transcript:', err);
}
}, []);
const scheduleDebouncedWrite = useCallback(() => {
if (writeTimerRef.current) clearTimeout(writeTimerRef.current);
writeTimerRef.current = setTimeout(() => {
void writeTranscriptToFile();
}, 5000);
}, [writeTranscriptToFile]);
const cleanup = useCallback(() => {
if (writeTimerRef.current) {
clearTimeout(writeTimerRef.current);
writeTimerRef.current = null;
}
if (processorRef.current) {
processorRef.current.disconnect();
processorRef.current = null;
}
if (audioCtxRef.current) {
audioCtxRef.current.close();
audioCtxRef.current = null;
}
if (micStreamRef.current) {
micStreamRef.current.getTracks().forEach(t => t.stop());
micStreamRef.current = null;
}
if (systemStreamRef.current) {
systemStreamRef.current.getTracks().forEach(t => t.stop());
systemStreamRef.current = null;
}
if (wsRef.current) {
wsRef.current.onclose = null;
wsRef.current.close();
wsRef.current = null;
}
}, []);
const start = useCallback(async (): Promise<string | null> => {
if (state !== 'idle') return null;
setState('connecting');
// Get Deepgram token
let ws: WebSocket;
try {
const result = await window.ipc.invoke('voice:getDeepgramToken', null);
if (result) {
console.log('[meeting] Using proxy token');
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['bearer', result.token]);
} else {
const config = await window.ipc.invoke('voice:getConfig', null);
if (!config?.deepgram) {
console.error('[meeting] No Deepgram config available');
setState('idle');
return null;
}
console.log('[meeting] Using API key');
ws = new WebSocket(DEEPGRAM_LISTEN_URL, ['token', config.deepgram.apiKey]);
}
} catch (err) {
console.error('[meeting] Failed to get Deepgram token:', err);
setState('idle');
return null;
}
wsRef.current = ws;
// Wait for WS open
const wsOk = await new Promise<boolean>((resolve) => {
ws.onopen = () => resolve(true);
ws.onerror = () => resolve(false);
setTimeout(() => resolve(false), 5000);
});
if (!wsOk) {
console.error('[meeting] WebSocket failed to connect');
cleanup();
setState('idle');
return null;
}
console.log('[meeting] WebSocket connected');
// Set up WS message handler
transcriptRef.current = [];
ws.onmessage = (event) => {
const data = JSON.parse(event.data);
if (!data.channel?.alternatives?.[0]) return;
const transcript = data.channel.alternatives[0].transcript;
if (!transcript || !data.is_final) return;
const channelIndex = data.channel_index?.[0] ?? 0;
const speaker = channelIndex === 0 ? 'You' : 'Speaker';
// Merge with last entry if same speaker
const entries = transcriptRef.current;
if (entries.length > 0 && entries[entries.length - 1].speaker === speaker) {
entries[entries.length - 1].text += ' ' + transcript;
} else {
entries.push({ speaker, text: transcript });
}
scheduleDebouncedWrite();
};
ws.onclose = () => {
console.log('[meeting] WebSocket closed');
wsRef.current = null;
};
// Get mic stream
let micStream: MediaStream;
try {
micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch (err) {
console.error('[meeting] Microphone access denied:', err);
cleanup();
setState('idle');
return null;
}
micStreamRef.current = micStream;
// Get system audio via getDisplayMedia
// The main process setDisplayMediaRequestHandler auto-approves with loopback audio
let systemStream: MediaStream;
try {
systemStream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true });
// Stop any video tracks — we only need audio
systemStream.getVideoTracks().forEach(t => t.stop());
} catch (err) {
console.error('[meeting] System audio access denied:', err);
cleanup();
setState('idle');
return null;
}
if (systemStream.getAudioTracks().length === 0) {
console.error('[meeting] No audio track from getDisplayMedia');
systemStream.getTracks().forEach(t => t.stop());
cleanup();
setState('idle');
return null;
}
console.log('[meeting] System audio captured');
systemStreamRef.current = systemStream;
// Set up AudioContext with channel merger
const audioCtx = new AudioContext({ sampleRate: 16000 });
audioCtxRef.current = audioCtx;
const micSource = audioCtx.createMediaStreamSource(micStream);
const systemSource = audioCtx.createMediaStreamSource(systemStream);
const merger = audioCtx.createChannelMerger(2);
micSource.connect(merger, 0, 0); // mic → channel 0
systemSource.connect(merger, 0, 1); // system audio → channel 1
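// Note: ScriptProcessorNode is deprecated in favor of AudioWorklet, though it still works in current Electron/Chromium.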
const processor = audioCtx.createScriptProcessor(4096, 2, 2);
processorRef.current = processor;
processor.onaudioprocess = (e) => {
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
const ch0 = e.inputBuffer.getChannelData(0);
const ch1 = e.inputBuffer.getChannelData(1);
// Interleave 2 channels into stereo int16 PCM
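// Scaling is asymmetric: negative samples map to -32768..0 and positive samples to 0..32767.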
const int16 = new Int16Array(ch0.length * 2);
for (let i = 0; i < ch0.length; i++) {
const s0 = Math.max(-1, Math.min(1, ch0[i]));
const s1 = Math.max(-1, Math.min(1, ch1[i]));
int16[i * 2] = s0 < 0 ? s0 * 0x8000 : s0 * 0x7fff;
int16[i * 2 + 1] = s1 < 0 ? s1 * 0x8000 : s1 * 0x7fff;
}
wsRef.current.send(int16.buffer);
};
merger.connect(processor);
processor.connect(audioCtx.destination);
// Create the note file
const now = new Date();
const dateStr = now.toISOString();
dateRef.current = dateStr;
const timestamp = dateStr.replace(/:/g, '-').replace(/\.\d+Z$/, '');
const notePath = `knowledge/Meetings/rowboat/meeting-${timestamp}.md`;
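// e.g. "knowledge/Meetings/rowboat/meeting-2026-03-17T04-48-23.md" (hypothetical timestamp)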
notePathRef.current = notePath;
const initialContent = formatTranscript([], dateStr);
await window.ipc.invoke('workspace:writeFile', {
path: notePath,
data: initialContent,
opts: { encoding: 'utf8', mkdirp: true },
});
setState('recording');
return notePath;
}, [state, cleanup, scheduleDebouncedWrite]);
const stop = useCallback(async () => {
if (state !== 'recording') return;
setState('stopping');
cleanup();
// Write final transcript
await writeTranscriptToFile();
setState('idle');
}, [state, cleanup, writeTranscriptToFile]);
return { state, start, stop };
}