feat(voice): audio-reactive waveform while recording (no live transcript) (#634)

* feat(voice): show audio-reactive waveform instead of live transcript

When recording, the chat input now displays only a live waveform that
accumulates from the left and grows to full width, with bar heights
driven by real mic amplitude. The transcribed words are still captured
and submitted, just not shown while recording. New bars animate in and
flow smoothly at ~16 updates/sec.

* feat(voice): auto-gain waveform bar heights to track voice dynamics

Normalize each frame's amplitude against a running peak (instant attack,
slow release) at capture time and map it with a near-linear curve, so bar
heights accurately reflect how loud/soft the voice is regardless of mic
gain — replacing the old fixed-gain sqrt curve that saturated near max.
This commit is contained in:
gagan 2026-06-22 14:10:56 -07:00 committed by GitHub
parent de7d6b7a10
commit a12bf4837b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 130 additions and 16 deletions

View file

@ -6193,6 +6193,7 @@ function App() {
isRecording={isActive && isRecording}
recordingText={isActive ? voice.interimText : undefined}
recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
audioLevelsRef={voice.audioLevelsRef}
onStartRecording={isActive ? handleStartRecording : undefined}
onSubmitRecording={isActive ? handleSubmitRecording : undefined}
onCancelRecording={isActive ? handleCancelRecording : undefined}
@ -6301,6 +6302,7 @@ function App() {
isRecording={isRecording}
recordingText={voice.interimText}
recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
audioLevelsRef={voice.audioLevelsRef}
onStartRecording={handleStartRecording}
onSubmitRecording={handleSubmitRecording}
onCancelRecording={handleCancelRecording}

View file

@ -224,6 +224,8 @@ interface ChatInputInnerProps {
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
/** Live mic amplitude history (RMS per frame) driving the recording waveform. */
audioLevelsRef?: React.MutableRefObject<number[]>
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
@ -260,7 +262,7 @@ function ChatInputInner({
onDraftChange,
isRecording,
recordingText,
recordingState,
audioLevelsRef,
onStartRecording,
onSubmitRecording,
onCancelRecording,
@ -795,11 +797,10 @@ function ChatInputInner({
>
<X className="h-4 w-4" />
</button>
<div className="flex flex-1 items-center gap-2 overflow-hidden">
<VoiceWaveform />
<span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
{recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
</span>
{/* Audio-reactive waveform only — the transcribed words are intentionally
not shown while recording; they're still captured and submitted. */}
<div className="flex flex-1 items-center overflow-hidden">
<VoiceWaveform audioLevelsRef={audioLevelsRef} />
</div>
<Button
size="icon"
@ -1339,22 +1340,89 @@ function ChatInputInner({
}
/** Animated waveform bars for the recording indicator */
function VoiceWaveform() {
// Live recording waveform. Each bar is one captured audio frame; bars accumulate
// from the left and grow rightward until they fill the width, then scroll (oldest
// drops off the left). Bar height tracks that frame's mic amplitude, so the
// waveform visibly reacts to how loud the user is speaking.
const WAVE_BAR_WIDTH = 3 // px
const WAVE_BAR_GAP = 2 // px
const WAVE_BAR_PITCH = WAVE_BAR_WIDTH + WAVE_BAR_GAP // px — horizontal space one bar occupies, gap included
const WAVE_BAR_MIN = 1.5 // px — floor so silence still shows a faint line
const WAVE_BAR_MAX = 18 // px — fits inside the h-5 (20px) row
const WAVE_CURVE = 0.8 // <1 lifts quiet speech slightly; near-linear keeps loud peaks tall

/**
 * Converts one frame's amplitude into a bar height in pixels.
 *
 * @param level Frame amplitude, expected in 0..1. Values outside that range are
 *   clamped; NaN is treated as silence so the bar never gets a `NaNpx` height.
 * @returns A height between WAVE_BAR_MIN and WAVE_BAR_MAX (inclusive).
 */
function waveBarHeight(level: number): number {
  // Math.max/Math.min propagate NaN, so it has to be handled before clamping.
  const clamped = Number.isNaN(level) ? 0 : Math.min(1, Math.max(0, level))
  // Gentle power curve: louder voice ⇒ visibly taller bar, quiet ⇒ short.
  const amp = clamped ** WAVE_CURVE
  return WAVE_BAR_MIN + amp * (WAVE_BAR_MAX - WAVE_BAR_MIN)
}
function VoiceWaveform({ audioLevelsRef }: { audioLevelsRef?: React.MutableRefObject<number[]> }) {
const containerRef = useRef<HTMLDivElement>(null)
const [bars, setBars] = useState<number[]>([])
// How many bars fit in the current width; recomputed on resize.
const maxBarsRef = useRef(48)
useEffect(() => {
const el = containerRef.current
if (!el) return
const measure = () => {
maxBarsRef.current = Math.max(1, Math.floor(el.clientWidth / WAVE_BAR_PITCH))
}
measure()
const ro = new ResizeObserver(measure)
ro.observe(el)
return () => ro.disconnect()
}, [])
useEffect(() => {
if (!audioLevelsRef) return
let raf = 0
let lastSig = ''
const tick = () => {
const levels = audioLevelsRef.current
const maxBars = maxBarsRef.current
const next = levels.length > maxBars ? levels.slice(levels.length - maxBars) : levels
// Only re-render when the visible window actually changed. Length covers
// the growth phase; the trailing value covers the scrolling phase once full.
const sig = `${next.length}:${next.length ? next[next.length - 1] : 0}`
if (sig !== lastSig) {
lastSig = sig
setBars(next.slice())
}
raf = requestAnimationFrame(tick)
}
raf = requestAnimationFrame(tick)
return () => cancelAnimationFrame(raf)
}, [audioLevelsRef])
return (
<div className="flex items-center gap-[3px] h-5">
{[0, 1, 2, 3, 4].map((i) => (
<div
ref={containerRef}
className="flex h-5 w-full items-center overflow-hidden"
style={{ gap: `${WAVE_BAR_GAP}px` }}
>
{/* Each newly-appended bar mounts with `voice-bar-in` (grows + fades in) so it
doesn't pop. Once the strip is full and values scroll through the bars, the
height transition makes them flow smoothly instead of stepping. */}
{bars.map((level, i) => (
<span
key={i}
className="w-[3px] rounded-full bg-primary"
className="shrink-0 rounded-full bg-primary"
style={{
animation: `voice-wave 1.2s ease-in-out ${i * 0.15}s infinite`,
width: `${WAVE_BAR_WIDTH}px`,
height: `${waveBarHeight(level)}px`,
transformOrigin: 'center',
transition: 'height 90ms linear',
animation: 'voice-bar-in 130ms ease-out',
}}
/>
))}
<style>{`
@keyframes voice-wave {
0%, 100% { height: 4px; }
50% { height: 16px; }
@keyframes voice-bar-in {
from { transform: scaleY(0.15); opacity: 0; }
to { transform: scaleY(1); opacity: 1; }
}
`}</style>
</div>
@ -1378,6 +1446,7 @@ export interface ChatInputWithMentionsProps {
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
audioLevelsRef?: React.MutableRefObject<number[]>
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
@ -1411,6 +1480,7 @@ export function ChatInputWithMentions({
isRecording,
recordingText,
recordingState,
audioLevelsRef,
onStartRecording,
onSubmitRecording,
onCancelRecording,
@ -1441,6 +1511,7 @@ export function ChatInputWithMentions({
isRecording={isRecording}
recordingText={recordingText}
recordingState={recordingState}
audioLevelsRef={audioLevelsRef}
onStartRecording={onStartRecording}
onSubmitRecording={onSubmitRecording}
onCancelRecording={onCancelRecording}

View file

@ -178,6 +178,7 @@ interface ChatSidebarProps {
isRecording?: boolean
recordingText?: string
recordingState?: 'connecting' | 'listening'
audioLevelsRef?: React.MutableRefObject<number[]>
onStartRecording?: () => void
onSubmitRecording?: () => void
onCancelRecording?: () => void
@ -240,6 +241,7 @@ export function ChatSidebar({
isRecording,
recordingText,
recordingState,
audioLevelsRef,
onStartRecording,
onSubmitRecording,
onCancelRecording,
@ -811,6 +813,7 @@ export function ChatSidebar({
isRecording={isActive && isRecording}
recordingText={isActive ? recordingText : undefined}
recordingState={isActive ? recordingState : undefined}
audioLevelsRef={audioLevelsRef}
onStartRecording={isActive ? onStartRecording : undefined}
onSubmitRecording={isActive ? onSubmitRecording : undefined}
onCancelRecording={isActive ? onCancelRecording : undefined}

View file

@ -20,6 +20,17 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
});
const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;
// Cap on retained per-frame amplitude samples (~64ms/frame ⇒ ~5 min of history).
// The waveform only ever displays the most recent window, so older samples are dropped.
// Enforced at capture time: once the history exceeds this length, the oldest
// entries are spliced off the front.
const MAX_AUDIO_LEVELS = 4800;
// Auto-gain for the waveform: each frame's amplitude is stored normalized against a
// running peak (instant attack, slow release) so bar heights track the *relative*
// loudness of the voice accurately regardless of mic/OS input gain. MIN_PEAK is a
// floor so near-silence doesn't get amplified up into tall bars.
// PEAK_DECAY is the release factor: the running peak is multiplied by it once per
// captured frame, unless the current frame's RMS (or MIN_PEAK) is larger.
const PEAK_DECAY = 0.97;
// Lower bound on the normalization divisor, in the same units as the frame RMS.
const MIN_PEAK = 0.02;
// Cache auth details so we don't need IPC round-trips on every mic click
let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null;
@ -35,6 +46,12 @@ export function useVoiceMode() {
const interimRef = useRef('');
// Buffer audio chunks captured before the WebSocket is ready
const audioBufferRef = useRef<ArrayBuffer[]>([]);
// Rolling history of per-frame mic amplitude (auto-gained to 0..1), oldest first.
// Drives the live waveform — the UI reads this via requestAnimationFrame so
// amplitude updates never re-render the rest of the tree.
const audioLevelsRef = useRef<number[]>([]);
// Running peak amplitude for the waveform auto-gain (see PEAK_DECAY/MIN_PEAK).
const audioPeakRef = useRef(0);
// Refresh cached auth details (called on warmup, not on mic click)
const refreshAuth = useCallback(async () => {
@ -132,6 +149,8 @@ export function useVoiceMode() {
wsRef.current = null;
}
audioBufferRef.current = [];
audioLevelsRef.current = [];
audioPeakRef.current = 0;
setInterimText('');
transcriptBufferRef.current = '';
interimRef.current = '';
@ -145,6 +164,8 @@ export function useVoiceMode() {
interimRef.current = '';
setInterimText('');
audioBufferRef.current = [];
audioLevelsRef.current = [];
audioPeakRef.current = 0;
// Show listening immediately — don't wait for WebSocket
setState('listening');
@ -188,15 +209,32 @@ export function useVoiceMode() {
const audioCtx = new AudioContext({ sampleRate: 16000 });
audioCtxRef.current = audioCtx;
const source = audioCtx.createMediaStreamSource(stream);
const processor = audioCtx.createScriptProcessor(2048, 1, 1);
// 1024-sample frames (~64ms at 16kHz) — smaller than the usual 2048 so the
// waveform gets ~16 amplitude updates/sec, making bars appear faster and
// flow more smoothly. Still a comfortable chunk size for Deepgram streaming.
const processor = audioCtx.createScriptProcessor(1024, 1, 1);
processorRef.current = processor;
processor.onaudioprocess = (e) => {
const float32 = e.inputBuffer.getChannelData(0);
const int16 = new Int16Array(float32.length);
let sumSquares = 0;
for (let i = 0; i < float32.length; i++) {
const s = Math.max(-1, Math.min(1, float32[i]));
int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
sumSquares += s * s;
}
// Record this frame's loudness for the live waveform, auto-gained against
// a running peak so bar heights accurately reflect the voice's dynamics.
// Instant attack (a louder frame raises the peak immediately), slow
// release (PEAK_DECAY), floored at MIN_PEAK so silence stays flat.
const rms = Math.sqrt(sumSquares / float32.length);
const peak = Math.max(rms, audioPeakRef.current * PEAK_DECAY, MIN_PEAK);
audioPeakRef.current = peak;
const levels = audioLevelsRef.current;
levels.push(rms / peak);
if (levels.length > MAX_AUDIO_LEVELS) {
levels.splice(0, levels.length - MAX_AUDIO_LEVELS);
}
const buffer = int16.buffer;
if (wsRef.current?.readyState === WebSocket.OPEN) {
@ -232,5 +270,5 @@ export function useVoiceMode() {
refreshAuth().catch(() => {});
}, [refreshAuth]);
return { state, interimText, start, submit, cancel, warmup };
return { state, interimText, audioLevelsRef, start, submit, cancel, warmup };
}