feat(voice): audio-reactive waveform while recording (no live transcript) (#634)

* feat(voice): show audio-reactive waveform instead of live transcript

  When recording, the chat input now displays only a live waveform that accumulates from the left and grows to full width, with bar heights driven by real mic amplitude. The transcribed words are still captured and submitted, just not shown while recording. New bars animate in and flow smoothly at ~16 updates/sec.

* feat(voice): auto-gain waveform bar heights to track voice dynamics

  Normalize each frame's amplitude against a running peak (instant attack, slow release) at capture time and map it with a near-linear curve, so bar heights accurately reflect how loud/soft the voice is regardless of mic gain — replacing the old fixed-gain sqrt curve that saturated near max.
2026-06-24 20:28:16 +02:00 · 2026-06-22 14:10:56 -07:00 · 2026-06-22 14:10:56 -07:00 · a12bf4837b
commit a12bf4837b
parent de7d6b7a10
4 changed files with 130 additions and 16 deletions
--- a/apps/x/apps/renderer/src/App.tsx
+++ b/apps/x/apps/renderer/src/App.tsx
@ -6193,6 +6193,7 @@ function App() {
                            isRecording={isActive && isRecording}
                            recordingText={isActive ? voice.interimText : undefined}
                            recordingState={isActive ? (voice.state === 'connecting' ? 'connecting' : 'listening') : undefined}
+                            audioLevelsRef={voice.audioLevelsRef}
                            onStartRecording={isActive ? handleStartRecording : undefined}
                            onSubmitRecording={isActive ? handleSubmitRecording : undefined}
                            onCancelRecording={isActive ? handleCancelRecording : undefined}
@ -6301,6 +6302,7 @@ function App() {
                isRecording={isRecording}
                recordingText={voice.interimText}
                recordingState={voice.state === 'connecting' ? 'connecting' : 'listening'}
+                audioLevelsRef={voice.audioLevelsRef}
                onStartRecording={handleStartRecording}
                onSubmitRecording={handleSubmitRecording}
                onCancelRecording={handleCancelRecording}
--- a/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
+++ b/apps/x/apps/renderer/src/components/chat-input-with-mentions.tsx
@ -224,6 +224,8 @@ interface ChatInputInnerProps {
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
+  /** Live mic amplitude history (per-frame RMS, auto-gained to 0..1, oldest first) driving the recording waveform. */
+  audioLevelsRef?: React.MutableRefObject<number[]>
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
@ -260,7 +262,7 @@ function ChatInputInner({
  onDraftChange,
  isRecording,
  recordingText,
-  recordingState,
+  audioLevelsRef,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
@ -795,11 +797,10 @@ function ChatInputInner({
          >
            <X className="h-4 w-4" />
          </button>
-          <div className="flex flex-1 items-center gap-2 overflow-hidden">
-            <VoiceWaveform />
-            <span className="min-w-0 flex-1 truncate text-sm text-muted-foreground">
-              {recordingState === 'connecting' ? 'Connecting...' : recordingText || 'Listening...'}
-            </span>
+          {/* Audio-reactive waveform only — the transcribed words are intentionally
+              not shown while recording; they're still captured and submitted. */}
+          <div className="flex flex-1 items-center overflow-hidden">
+            <VoiceWaveform audioLevelsRef={audioLevelsRef} />
          </div>
          <Button
            size="icon"
@ -1339,22 +1340,89 @@ function ChatInputInner({
 }

 /** Animated waveform bars for the recording indicator */
-function VoiceWaveform() {
+// Live recording waveform. Each bar is one captured audio frame; bars accumulate
+// from the left and grow rightward until they fill the width, then scroll (oldest
+// drops off the left). Bar height tracks that frame's mic amplitude, so the
+// waveform visibly reacts to how loud the user is speaking.
+const WAVE_BAR_WIDTH = 3 // px
+const WAVE_BAR_GAP = 2 // px
+const WAVE_BAR_PITCH = WAVE_BAR_WIDTH + WAVE_BAR_GAP // px — stride from one bar's left edge to the next
+const WAVE_BAR_MIN = 1.5 // px — floor so silence still shows a faint line
+const WAVE_BAR_MAX = 18 // px — fits inside the h-5 (20px) row
+const WAVE_CURVE = 0.8 // exponent applied to the 0..1 level; <1 lifts quiet speech slightly; near-linear keeps loud peaks tall
+
+function waveBarHeight(level: number): number {
+  // Maps an auto-gained level (~0..1, produced by the hook) to a bar height in px on a
+  // near-linear curve: louder ⇒ taller. Out-of-range input is clamped; NaN counts as silence.
+  const amp = (Number.isNaN(level) ? 0 : Math.min(1, Math.max(0, level))) ** WAVE_CURVE
+  return WAVE_BAR_MIN + amp * (WAVE_BAR_MAX - WAVE_BAR_MIN)
+}
+
+function VoiceWaveform({ audioLevelsRef }: { audioLevelsRef?: React.MutableRefObject<number[]> }) {
+  const containerRef = useRef<HTMLDivElement>(null)
+  const [bars, setBars] = useState<number[]>([])
+  // How many bars fit in the current width; recomputed on resize.
+  const maxBarsRef = useRef(48)
+
+  useEffect(() => {
+    const el = containerRef.current
+    if (!el) return
+    const measure = () => {
+      maxBarsRef.current = Math.max(1, Math.floor(el.clientWidth / WAVE_BAR_PITCH))
+    }
+    measure()
+    const ro = new ResizeObserver(measure)
+    ro.observe(el)
+    return () => ro.disconnect()
+  }, [])
+
+  // Poll the shared levels ref once per animation frame and mirror its tail into state.
+  useEffect(() => {
+    if (!audioLevelsRef) return
+    let raf = 0
+    let shown: number[] = []
+    const tick = () => {
+      const levels = audioLevelsRef.current
+      const next = levels.slice(Math.max(0, levels.length - maxBarsRef.current))
+      // Re-render only when the visible window changed. Compare bar by bar: a
+      // length + last-value signature misses a frame that repeats the previous level
+      // (0 in digital silence, 1 on back-to-back peaks), stalling the scroll.
+      if (next.length !== shown.length || next.some((v, i) => v !== shown[i])) {
+        shown = next
+        setBars(next)
+      }
+      raf = requestAnimationFrame(tick)
+    }
+    raf = requestAnimationFrame(tick)
+    return () => cancelAnimationFrame(raf)
+  }, [audioLevelsRef])
+
  return (
-    <div className="flex items-center gap-[3px] h-5">
-      {[0, 1, 2, 3, 4].map((i) => (
+    <div
+      ref={containerRef}
+      className="flex h-5 w-full items-center overflow-hidden"
+      style={{ gap: `${WAVE_BAR_GAP}px` }}
+    >
+      {/* Each newly-appended bar mounts with `voice-bar-in` (grows + fades in) so it
+          doesn't pop. Once the strip is full and values scroll through the bars, the
+          height transition makes them flow smoothly instead of stepping. */}
+      {bars.map((level, i) => (
        <span
          key={i}
-          className="w-[3px] rounded-full bg-primary"
+          className="shrink-0 rounded-full bg-primary"
          style={{
-            animation: `voice-wave 1.2s ease-in-out ${i * 0.15}s infinite`,
+            width: `${WAVE_BAR_WIDTH}px`,
+            height: `${waveBarHeight(level)}px`,
+            transformOrigin: 'center',
+            transition: 'height 90ms linear',
+            animation: 'voice-bar-in 130ms ease-out',
          }}
        />
      ))}
      <style>{`
-        @keyframes voice-wave {
-          0%, 100% { height: 4px; }
-          50% { height: 16px; }
+        @keyframes voice-bar-in {
+          from { transform: scaleY(0.15); opacity: 0; }
+          to { transform: scaleY(1); opacity: 1; }
        }
      `}</style>
    </div>
@ -1378,6 +1446,7 @@ export interface ChatInputWithMentionsProps {
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
+  audioLevelsRef?: React.MutableRefObject<number[]>
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
@ -1411,6 +1480,7 @@ export function ChatInputWithMentions({
  isRecording,
  recordingText,
  recordingState,
+  audioLevelsRef,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
@ -1441,6 +1511,7 @@ export function ChatInputWithMentions({
        isRecording={isRecording}
        recordingText={recordingText}
        recordingState={recordingState}
+        audioLevelsRef={audioLevelsRef}
        onStartRecording={onStartRecording}
        onSubmitRecording={onSubmitRecording}
        onCancelRecording={onCancelRecording}
--- a/apps/x/apps/renderer/src/components/chat-sidebar.tsx
+++ b/apps/x/apps/renderer/src/components/chat-sidebar.tsx
@ -178,6 +178,7 @@ interface ChatSidebarProps {
  isRecording?: boolean
  recordingText?: string
  recordingState?: 'connecting' | 'listening'
+  audioLevelsRef?: React.MutableRefObject<number[]>
  onStartRecording?: () => void
  onSubmitRecording?: () => void
  onCancelRecording?: () => void
@ -240,6 +241,7 @@ export function ChatSidebar({
  isRecording,
  recordingText,
  recordingState,
+  audioLevelsRef,
  onStartRecording,
  onSubmitRecording,
  onCancelRecording,
@ -811,6 +813,7 @@ export function ChatSidebar({
                          isRecording={isActive && isRecording}
                          recordingText={isActive ? recordingText : undefined}
                          recordingState={isActive ? recordingState : undefined}
+                          audioLevelsRef={audioLevelsRef}
                          onStartRecording={isActive ? onStartRecording : undefined}
                          onSubmitRecording={isActive ? onSubmitRecording : undefined}
                          onCancelRecording={isActive ? onCancelRecording : undefined}
--- a/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
+++ b/apps/x/apps/renderer/src/hooks/useVoiceMode.ts
@ -20,6 +20,17 @@ const DEEPGRAM_PARAMS = new URLSearchParams({
 });
 const DEEPGRAM_LISTEN_URL = `wss://api.deepgram.com/v1/listen?${DEEPGRAM_PARAMS.toString()}`;

+// Cap on retained per-frame amplitude samples (~64ms/frame ⇒ ~5 min of history).
+// The waveform only ever displays the most recent window, so older samples are dropped.
+const MAX_AUDIO_LEVELS = 4800;
+
+// Auto-gain for the waveform: each frame's amplitude is stored normalized against a
+// running peak (instant attack, slow release) so bar heights track the *relative*
+// loudness of the voice accurately regardless of mic/OS input gain. MIN_PEAK is a
+// floor so near-silence doesn't get amplified up into tall bars.
+const PEAK_DECAY = 0.97; // per-frame release multiplier: an unrefreshed peak halves in ~23 frames (~1.5s at ~64ms/frame)
+const MIN_PEAK = 0.02; // RMS floor; samples are clamped to [-1, 1], so this is 2% of full scale
+
 // Cache auth details so we don't need IPC round-trips on every mic click
 let cachedAuth: { type: 'rowboat'; url: string; token: string } | { type: 'local'; apiKey: string } | null = null;

@ -35,6 +46,12 @@ export function useVoiceMode() {
    const interimRef = useRef('');
    // Buffer audio chunks captured before the WebSocket is ready
    const audioBufferRef = useRef<ArrayBuffer[]>([]);
+    // Rolling history of per-frame mic amplitude (auto-gained to 0..1), oldest first.
+    // Drives the live waveform — the UI reads this via requestAnimationFrame so
+    // amplitude updates never re-render the rest of the tree.
+    const audioLevelsRef = useRef<number[]>([]);
+    // Running peak amplitude for the waveform auto-gain (see PEAK_DECAY/MIN_PEAK).
+    const audioPeakRef = useRef(0);

    // Refresh cached auth details (called on warmup, not on mic click)
    const refreshAuth = useCallback(async () => {
@ -132,6 +149,8 @@ export function useVoiceMode() {
            wsRef.current = null;
        }
        audioBufferRef.current = [];
+        audioLevelsRef.current = [];
+        audioPeakRef.current = 0;
        setInterimText('');
        transcriptBufferRef.current = '';
        interimRef.current = '';
@ -145,6 +164,8 @@ export function useVoiceMode() {
        interimRef.current = '';
        setInterimText('');
        audioBufferRef.current = [];
+        audioLevelsRef.current = [];
+        audioPeakRef.current = 0;

        // Show listening immediately — don't wait for WebSocket
        setState('listening');
@ -188,15 +209,32 @@ export function useVoiceMode() {
        const audioCtx = new AudioContext({ sampleRate: 16000 });
        audioCtxRef.current = audioCtx;
        const source = audioCtx.createMediaStreamSource(stream);
-        const processor = audioCtx.createScriptProcessor(2048, 1, 1);
+        // 1024-sample frames (~64ms at 16kHz) — smaller than the usual 2048 so the
+        // waveform gets ~16 amplitude updates/sec, making bars appear faster and
+        // flow more smoothly. Still a comfortable chunk size for Deepgram streaming.
+        const processor = audioCtx.createScriptProcessor(1024, 1, 1);
        processorRef.current = processor;

        processor.onaudioprocess = (e) => {
            const float32 = e.inputBuffer.getChannelData(0);
            const int16 = new Int16Array(float32.length);
+            let sumSquares = 0;
            for (let i = 0; i < float32.length; i++) {
                const s = Math.max(-1, Math.min(1, float32[i]));
                int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
+                sumSquares += s * s;
+            }
+            // Record this frame's loudness for the live waveform, auto-gained against
+            // a running peak so bar heights accurately reflect the voice's dynamics.
+            // Instant attack (a louder frame raises the peak immediately), slow
+            // release (PEAK_DECAY), floored at MIN_PEAK so silence stays flat.
+            const rms = Math.sqrt(sumSquares / float32.length);
+            const peak = Math.max(rms, audioPeakRef.current * PEAK_DECAY, MIN_PEAK);
+            audioPeakRef.current = peak;
+            const levels = audioLevelsRef.current;
+            levels.push(rms / peak);
+            if (levels.length > MAX_AUDIO_LEVELS) {
+                levels.splice(0, levels.length - MAX_AUDIO_LEVELS);
            }
            const buffer = int16.buffer;
            if (wsRef.current?.readyState === WebSocket.OPEN) {
@ -232,5 +270,5 @@ export function useVoiceMode() {
        refreshAuth().catch(() => {});
    }, [refreshAuth]);

-    return { state, interimText, start, submit, cancel, warmup };
+    return { state, interimText, audioLevelsRef, start, submit, cancel, warmup };
 }