mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-05-19 18:35:18 +02:00
diarization and mic muting
This commit is contained in:
parent
840ba560f2
commit
537ca08fe5
1 changed file with 81 additions and 22 deletions
|
|
@ -8,12 +8,36 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
|
||||||
sample_rate: '16000',
|
sample_rate: '16000',
|
||||||
channels: '2',
|
channels: '2',
|
||||||
multichannel: 'true',
|
multichannel: 'true',
|
||||||
|
diarize: 'true',
|
||||||
interim_results: 'true',
|
interim_results: 'true',
|
||||||
smart_format: 'true',
|
smart_format: 'true',
|
||||||
punctuate: 'true',
|
punctuate: 'true',
|
||||||
});
|
});
|
||||||
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
|
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
|
||||||
|
|
||||||
|
// RMS threshold: system audio above this = "active" (speakers playing)
|
||||||
|
const SYSTEM_AUDIO_GATE_THRESHOLD = 0.005;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Headphone detection
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
/**
 * Heuristically decides whether the default audio output looks like
 * headphones rather than built-in speakers.
 *
 * Enumerates media devices, picks the `audiooutput` entry whose `deviceId`
 * is `'default'`, and checks its lower-cased label against a list of
 * headphone-related substrings.
 *
 * @returns `true` when the default output's label contains one of the known
 *   headphone patterns; `false` when no pattern matches, when no `'default'`
 *   output entry exists, or when device enumeration throws.
 */
async function detectHeadphones(): Promise<boolean> {
  // Substrings that built-in speaker labels are not expected to contain.
  const headphonePatterns = [
    'headphone',
    'airpod',
    'earpod',
    'earphone',
    'earbud',
    'bluetooth',
    'bt_',
    'jabra',
    'bose',
    'sony wh',
    'sony wf',
  ];

  try {
    const devices = await navigator.mediaDevices.enumerateDevices();
    const defaultOutput = devices.find(
      (d) => d.kind === 'audiooutput' && d.deviceId === 'default',
    );
    // A missing entry or missing label collapses to '' and matches nothing.
    // NOTE(review): labels may be blank if media permission has not been
    // granted yet — confirm this runs after permission is available.
    const label = (defaultOutput?.label ?? '').toLowerCase();
    return headphonePatterns.some((p) => label.includes(p));
  } catch {
    // Best-effort detection: any enumeration failure is treated as "speakers".
    return false;
  }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Transcript formatting
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
interface TranscriptEntry {
|
interface TranscriptEntry {
|
||||||
speaker: string;
|
speaker: string;
|
||||||
text: string;
|
text: string;
|
||||||
|
|
@ -32,7 +56,6 @@ function formatTranscript(entries: TranscriptEntry[], date: string): string {
|
||||||
'',
|
'',
|
||||||
];
|
];
|
||||||
for (let i = 0; i < entries.length; i++) {
|
for (let i = 0; i < entries.length; i++) {
|
||||||
// Add extra blank line between different speakers
|
|
||||||
if (i > 0 && entries[i].speaker !== entries[i - 1].speaker) {
|
if (i > 0 && entries[i].speaker !== entries[i - 1].speaker) {
|
||||||
lines.push('');
|
lines.push('');
|
||||||
}
|
}
|
||||||
|
|
@ -42,6 +65,9 @@ function formatTranscript(entries: TranscriptEntry[], date: string): string {
|
||||||
return lines.join('\n');
|
return lines.join('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Hook
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
export function useMeetingTranscription() {
|
export function useMeetingTranscription() {
|
||||||
const [state, setState] = useState<MeetingTranscriptionState>('idle');
|
const [state, setState] = useState<MeetingTranscriptionState>('idle');
|
||||||
const wsRef = useRef<WebSocket | null>(null);
|
const wsRef = useRef<WebSocket | null>(null);
|
||||||
|
|
@ -57,7 +83,6 @@ export function useMeetingTranscription() {
|
||||||
|
|
||||||
const writeTranscriptToFile = useCallback(async () => {
|
const writeTranscriptToFile = useCallback(async () => {
|
||||||
if (!notePathRef.current) return;
|
if (!notePathRef.current) return;
|
||||||
// Combine finalized entries with any in-progress interim text
|
|
||||||
const entries = [...transcriptRef.current];
|
const entries = [...transcriptRef.current];
|
||||||
for (const interim of interimRef.current.values()) {
|
for (const interim of interimRef.current.values()) {
|
||||||
if (!interim.text) continue;
|
if (!interim.text) continue;
|
||||||
|
|
@ -119,6 +144,10 @@ export function useMeetingTranscription() {
|
||||||
if (state !== 'idle') return null;
|
if (state !== 'idle') return null;
|
||||||
setState('connecting');
|
setState('connecting');
|
||||||
|
|
||||||
|
// Detect headphones vs speakers
|
||||||
|
const usingHeadphones = await detectHeadphones();
|
||||||
|
console.log(`[meeting] Audio output mode: ${usingHeadphones ? 'headphones' : 'speakers'}`);
|
||||||
|
|
||||||
// Get Deepgram token
|
// Get Deepgram token
|
||||||
let ws: WebSocket;
|
let ws: WebSocket;
|
||||||
try {
|
try {
|
||||||
|
|
@ -167,12 +196,21 @@ export function useMeetingTranscription() {
|
||||||
if (!transcript) return;
|
if (!transcript) return;
|
||||||
|
|
||||||
const channelIndex = data.channel_index?.[0] ?? 0;
|
const channelIndex = data.channel_index?.[0] ?? 0;
|
||||||
const speaker = channelIndex === 0 ? 'You' : 'System audio';
|
const isMic = channelIndex === 0;
|
||||||
|
|
||||||
|
// Channel 0 = mic = "You", Channel 1 = system audio with diarization
|
||||||
|
let speaker: string;
|
||||||
|
if (isMic) {
|
||||||
|
speaker = 'You';
|
||||||
|
} else {
|
||||||
|
// Use Deepgram diarization speaker ID for system audio channel
|
||||||
|
const words = data.channel.alternatives[0].words;
|
||||||
|
const speakerId = words?.[0]?.speaker;
|
||||||
|
speaker = speakerId != null ? `Speaker ${speakerId}` : 'System audio';
|
||||||
|
}
|
||||||
|
|
||||||
if (data.is_final) {
|
if (data.is_final) {
|
||||||
// Clear interim for this channel
|
|
||||||
interimRef.current.delete(channelIndex);
|
interimRef.current.delete(channelIndex);
|
||||||
// Merge with last entry if same speaker
|
|
||||||
const entries = transcriptRef.current;
|
const entries = transcriptRef.current;
|
||||||
if (entries.length > 0 && entries[entries.length - 1].speaker === speaker) {
|
if (entries.length > 0 && entries[entries.length - 1].speaker === speaker) {
|
||||||
entries[entries.length - 1].text += ' ' + transcript;
|
entries[entries.length - 1].text += ' ' + transcript;
|
||||||
|
|
@ -180,7 +218,6 @@ export function useMeetingTranscription() {
|
||||||
entries.push({ speaker, text: transcript });
|
entries.push({ speaker, text: transcript });
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Update interim text for this channel
|
|
||||||
interimRef.current.set(channelIndex, { speaker, text: transcript });
|
interimRef.current.set(channelIndex, { speaker, text: transcript });
|
||||||
}
|
}
|
||||||
scheduleDebouncedWrite();
|
scheduleDebouncedWrite();
|
||||||
|
|
@ -194,7 +231,13 @@ export function useMeetingTranscription() {
|
||||||
// Get mic stream
|
// Get mic stream
|
||||||
let micStream: MediaStream;
|
let micStream: MediaStream;
|
||||||
try {
|
try {
|
||||||
micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
micStream = await navigator.mediaDevices.getUserMedia({
|
||||||
|
audio: {
|
||||||
|
echoCancellation: true,
|
||||||
|
noiseSuppression: true,
|
||||||
|
autoGainControl: true,
|
||||||
|
},
|
||||||
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('[meeting] Microphone access denied:', err);
|
console.error('[meeting] Microphone access denied:', err);
|
||||||
cleanup();
|
cleanup();
|
||||||
|
|
@ -203,12 +246,10 @@ export function useMeetingTranscription() {
|
||||||
}
|
}
|
||||||
micStreamRef.current = micStream;
|
micStreamRef.current = micStream;
|
||||||
|
|
||||||
// Get system audio via getDisplayMedia
|
// Get system audio via getDisplayMedia (loopback)
|
||||||
// The main process setDisplayMediaRequestHandler auto-approves with loopback audio
|
|
||||||
let systemStream: MediaStream;
|
let systemStream: MediaStream;
|
||||||
try {
|
try {
|
||||||
systemStream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true });
|
systemStream = await navigator.mediaDevices.getDisplayMedia({ audio: true, video: true });
|
||||||
// Stop any video tracks — we only need audio
|
|
||||||
systemStream.getVideoTracks().forEach(t => t.stop());
|
systemStream.getVideoTracks().forEach(t => t.stop());
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('[meeting] System audio access denied:', err);
|
console.error('[meeting] System audio access denied:', err);
|
||||||
|
|
@ -226,7 +267,7 @@ export function useMeetingTranscription() {
|
||||||
console.log('[meeting] System audio captured');
|
console.log('[meeting] System audio captured');
|
||||||
systemStreamRef.current = systemStream;
|
systemStreamRef.current = systemStream;
|
||||||
|
|
||||||
// Set up AudioContext with channel merger
|
// ----- Audio pipeline -----
|
||||||
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
||||||
audioCtxRef.current = audioCtx;
|
audioCtxRef.current = audioCtx;
|
||||||
|
|
||||||
|
|
@ -242,13 +283,35 @@ export function useMeetingTranscription() {
|
||||||
|
|
||||||
processor.onaudioprocess = (e) => {
|
processor.onaudioprocess = (e) => {
|
||||||
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
|
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
|
||||||
const ch0 = e.inputBuffer.getChannelData(0);
|
|
||||||
const ch1 = e.inputBuffer.getChannelData(1);
|
const micRaw = e.inputBuffer.getChannelData(0);
|
||||||
// Interleave 2 channels into stereo int16 PCM
|
const sysRaw = e.inputBuffer.getChannelData(1);
|
||||||
const int16 = new Int16Array(ch0.length * 2);
|
|
||||||
for (let i = 0; i < ch0.length; i++) {
|
// Mode 1 (headphones): pass both streams through unmodified
|
||||||
const s0 = Math.max(-1, Math.min(1, ch0[i]));
|
// Mode 2 (speakers): gate/mute mic when system audio is active
|
||||||
const s1 = Math.max(-1, Math.min(1, ch1[i]));
|
let micOut: Float32Array;
|
||||||
|
if (usingHeadphones) {
|
||||||
|
micOut = micRaw;
|
||||||
|
} else {
|
||||||
|
// Compute system audio RMS to detect activity
|
||||||
|
let sysSum = 0;
|
||||||
|
for (let i = 0; i < sysRaw.length; i++) sysSum += sysRaw[i] * sysRaw[i];
|
||||||
|
const sysRms = Math.sqrt(sysSum / sysRaw.length);
|
||||||
|
|
||||||
|
if (sysRms > SYSTEM_AUDIO_GATE_THRESHOLD) {
|
||||||
|
// System audio is playing — mute mic to prevent bleed
|
||||||
|
micOut = new Float32Array(micRaw.length); // all zeros
|
||||||
|
} else {
|
||||||
|
// System audio is silent — pass mic through
|
||||||
|
micOut = micRaw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Interleave mic (ch0) + system audio (ch1) into stereo int16 PCM
|
||||||
|
const int16 = new Int16Array(micOut.length * 2);
|
||||||
|
for (let i = 0; i < micOut.length; i++) {
|
||||||
|
const s0 = Math.max(-1, Math.min(1, micOut[i]));
|
||||||
|
const s1 = Math.max(-1, Math.min(1, sysRaw[i]));
|
||||||
int16[i * 2] = s0 < 0 ? s0 * 0x8000 : s0 * 0x7fff;
|
int16[i * 2] = s0 < 0 ? s0 * 0x8000 : s0 * 0x7fff;
|
||||||
int16[i * 2 + 1] = s1 < 0 ? s1 * 0x8000 : s1 * 0x7fff;
|
int16[i * 2 + 1] = s1 < 0 ? s1 * 0x8000 : s1 * 0x7fff;
|
||||||
}
|
}
|
||||||
|
|
@ -282,11 +345,7 @@ export function useMeetingTranscription() {
|
||||||
setState('stopping');
|
setState('stopping');
|
||||||
|
|
||||||
cleanup();
|
cleanup();
|
||||||
|
|
||||||
// Clear interims so final write only has finalized text
|
|
||||||
interimRef.current = new Map();
|
interimRef.current = new Map();
|
||||||
|
|
||||||
// Write final transcript
|
|
||||||
await writeTranscriptToFile();
|
await writeTranscriptToFile();
|
||||||
|
|
||||||
setState('idle');
|
setState('idle');
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue