From 537ca08fe5f80a26dbb72a1954b7f9b7668747d1 Mon Sep 17 00:00:00 2001 From: Arjun <6592213+arkml@users.noreply.github.com> Date: Tue, 17 Mar 2026 16:43:33 +0530 Subject: [PATCH] diarization and mic muting --- .../src/hooks/useMeetingTranscription.ts | 103 ++++++++++++++---- 1 file changed, 81 insertions(+), 22 deletions(-) diff --git a/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts b/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts index 5747e9bc..304264b8 100644 --- a/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts +++ b/apps/x/apps/renderer/src/hooks/useMeetingTranscription.ts @@ -8,12 +8,36 @@ const DEEPGRAM_PARAMS = new URLSearchParams({ sample_rate: '16000', channels: '2', multichannel: 'true', + diarize: 'true', interim_results: 'true', smart_format: 'true', punctuate: 'true', }); const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`; +// RMS threshold: system audio above this = "active" (speakers playing) +const SYSTEM_AUDIO_GATE_THRESHOLD = 0.005; + +// --------------------------------------------------------------------------- +// Headphone detection +// --------------------------------------------------------------------------- +async function detectHeadphones(): Promise { + try { + const devices = await navigator.mediaDevices.enumerateDevices(); + const outputs = devices.filter(d => d.kind === 'audiooutput'); + const defaultOutput = outputs.find(d => d.deviceId === 'default'); + const label = (defaultOutput?.label ?? '').toLowerCase(); + // Heuristic: built-in speakers won't match these patterns + const headphonePatterns = ['headphone', 'airpod', 'earpod', 'earphone', 'earbud', 'bluetooth', 'bt_', 'jabra', 'bose', 'sony wh', 'sony wf']; + return headphonePatterns.some(p => label.includes(p)); + } catch { + return false; + } +} + +// --------------------------------------------------------------------------- +// Transcript formatting +// --------------------------------------------------------------------------- interface TranscriptEntry { speaker: string; text: string; @@ -32,7 +56,6 @@ function formatTranscript(entries: TranscriptEntry[], date: string): string { '', ]; for (let i = 0; i < entries.length; i++) { - // Add extra blank line between different speakers if (i > 0 && entries[i].speaker !== entries[i - 1].speaker) { lines.push(''); } @@ -42,6 +65,9 @@ function formatTranscript(entries: TranscriptEntry[], date: string): string { return lines.join('\n'); } +// --------------------------------------------------------------------------- +// Hook +// --------------------------------------------------------------------------- export function useMeetingTranscription() { const [state, setState] = useState('idle'); const wsRef = useRef(null); @@ -57,7 +83,6 @@ export function useMeetingTranscription() { const writeTranscriptToFile = useCallback(async () => { if (!notePathRef.current) return; - // Combine finalized entries with any in-progress interim text const entries = [...transcriptRef.current]; for (const interim of interimRef.current.values()) { if (!interim.text) continue; @@ -119,6 +144,10 @@ export function useMeetingTranscription() { if (state !== 'idle') return null; setState('connecting'); + // Detect headphones vs speakers + const usingHeadphones = await detectHeadphones(); + console.log(`[meeting] Audio output mode: ${usingHeadphones ? 'headphones' : 'speakers'}`); + // Get Deepgram token let ws: WebSocket; try { @@ -167,12 +196,21 @@ export function useMeetingTranscription() { if (!transcript) return; const channelIndex = data.channel_index?.[0] ?? 0; - const speaker = channelIndex === 0 ? 'You' : 'System audio'; + const isMic = channelIndex === 0; + + // Channel 0 = mic = "You", Channel 1 = system audio with diarization + let speaker: string; + if (isMic) { + speaker = 'You'; + } else { + // Use Deepgram diarization speaker ID for system audio channel + const words = data.channel.alternatives[0].words; + const speakerId = words?.[0]?.speaker; + speaker = speakerId != null ? `Speaker ${speakerId}` : 'System audio'; + } if (data.is_final) { - // Clear interim for this channel interimRef.current.delete(channelIndex); - // Merge with last entry if same speaker const entries = transcriptRef.current; if (entries.length > 0 && entries[entries.length - 1].speaker === speaker) { entries[entries.length - 1].text += ' ' + transcript; @@ -180,7 +218,6 @@ export function useMeetingTranscription() { entries.push({ speaker, text: transcript }); } } else { - // Update interim text for this channel interimRef.current.set(channelIndex, { speaker, text: transcript }); } scheduleDebouncedWrite(); @@ -194,7 +231,13 @@ export function useMeetingTranscription() { // Get mic stream let micStream: MediaStream; try { - micStream = await navigator.mediaDevices.getUserMedia({ audio: true }); + micStream = await navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }); } catch (err) { console.error('[meeting] Microphone access denied:', err); cleanup(); @@ -203,12 +246,10 @@ export function useMeetingTranscription() { } micStreamRef.current = micStream; - // Get system audio via getDisplayMedia - // The main process setDisplayMediaRequestHandler auto-approves with loopback audio + // Get system audio via getDisplayMedia (loopback) let systemStream: MediaStream; try { systemStream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true }); - // Stop any video tracks — we only need audio systemStream.getVideoTracks().forEach(t => t.stop()); } catch (err) { console.error('[meeting] System audio access denied:', err); @@ -226,7 +267,7 @@ export function useMeetingTranscription() { console.log('[meeting] System audio captured'); systemStreamRef.current = systemStream; - // Set up AudioContext with channel merger + // ----- Audio pipeline ----- const audioCtx = new AudioContext({ sampleRate: 16000 }); audioCtxRef.current = audioCtx; @@ -242,13 +283,35 @@ export function useMeetingTranscription() { processor.onaudioprocess = (e) => { if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return; - const ch0 = e.inputBuffer.getChannelData(0); - const ch1 = e.inputBuffer.getChannelData(1); - // Interleave 2 channels into stereo int16 PCM - const int16 = new Int16Array(ch0.length * 2); - for (let i = 0; i < ch0.length; i++) { - const s0 = Math.max(-1, Math.min(1, ch0[i])); - const s1 = Math.max(-1, Math.min(1, ch1[i])); + + const micRaw = e.inputBuffer.getChannelData(0); + const sysRaw = e.inputBuffer.getChannelData(1); + + // Mode 1 (headphones): pass both streams through unmodified + // Mode 2 (speakers): gate/mute mic when system audio is active + let micOut: Float32Array; + if (usingHeadphones) { + micOut = micRaw; + } else { + // Compute system audio RMS to detect activity + let sysSum = 0; + for (let i = 0; i < sysRaw.length; i++) sysSum += sysRaw[i] * sysRaw[i]; + const sysRms = Math.sqrt(sysSum / sysRaw.length); + + if (sysRms > SYSTEM_AUDIO_GATE_THRESHOLD) { + // System audio is playing — mute mic to prevent bleed + micOut = new Float32Array(micRaw.length); // all zeros + } else { + // System audio is silent — pass mic through + micOut = micRaw; + } + } + + // Interleave mic (ch0) + system audio (ch1) into stereo int16 PCM + const int16 = new Int16Array(micOut.length * 2); + for (let i = 0; i < micOut.length; i++) { + const s0 = Math.max(-1, Math.min(1, micOut[i])); + const s1 = Math.max(-1, Math.min(1, sysRaw[i])); int16[i * 2] = s0 < 0 ? s0 * 0x8000 : s0 * 0x7fff; int16[i * 2 + 1] = s1 < 0 ? s1 * 0x8000 : s1 * 0x7fff; } @@ -282,11 +345,7 @@ export function useMeetingTranscription() { setState('stopping'); cleanup(); - - // Clear interims so final write only has finalized text interimRef.current = new Map(); - - // Write final transcript await writeTranscriptToFile(); setState('idle');