feat: persist split user and bot audio

2026-06-19 08:28:10 +02:00 · 2026-06-16 15:19:49 +05:30 · 2026-06-16 15:19:49 +05:30 · 3d1886c450
commit 3d1886c450
parent dd3f2e7323
30 changed files with 1322 additions and 253 deletions
--- a/api/services/configuration/registry.py
+++ b/api/services/configuration/registry.py
@ -944,6 +944,7 @@ class DograhTTSService(BaseTTSConfiguration):
    voice: str = Field(
        default="default",
        description="Voice preset.",
+        json_schema_extra={"allow_custom_input": True},
    )
    speed: float = Field(default=1.0, ge=0.5, le=2.0, description="Speed of the voice.")

--- a/api/services/pipecat/event_handlers.py
+++ b/api/services/pipecat/event_handlers.py
@ -9,8 +9,8 @@ from api.services.integrations import IntegrationRuntimeSession
 from api.services.pipecat.audio_config import AudioConfig
 from api.services.pipecat.audio_playback import play_audio_loop
 from api.services.pipecat.in_memory_buffers import (
-    InMemoryAudioBuffer,
    InMemoryLogsBuffer,
+    InMemoryRecordingBuffers,
 )
 from api.services.pipecat.pipeline_metrics_aggregator import PipelineMetricsAggregator
 from api.services.pipecat.tracing_config import get_trace_url
@ -40,11 +40,11 @@ async def _capture_call_event(
            "workflow_run_id": workflow_run_id,
            "workflow_id": workflow_run.workflow_id if workflow_run else None,
            "call_type": workflow_run.mode if workflow_run else None,
-            "call_direction": (workflow_run.initial_context or {}).get(
-                "direction", "outbound"
-            )
-            if workflow_run
-            else None,
+            "call_direction": (
+                (workflow_run.initial_context or {}).get("direction", "outbound")
+                if workflow_run
+                else None
+            ),
        }
        if extra_properties:
            properties.update(extra_properties)
@ -73,7 +73,7 @@ def register_event_handlers(
    """Register all event handlers for transport and task events.

    Returns:
-        in_memory_audio_buffer for use by other handlers.
+        In-memory recording buffers for use by other handlers.
    """
    # Initialize in-memory buffers with proper audio configuration
    sample_rate = audio_config.pipeline_sample_rate if audio_config else 16000
@ -84,7 +84,7 @@ def register_event_handlers(
        f"with sample_rate={sample_rate}Hz, channels={num_channels}"
    )

-    in_memory_audio_buffer = InMemoryAudioBuffer(
+    in_memory_audio_buffers = InMemoryRecordingBuffers(
        workflow_run_id=workflow_run_id,
        sample_rate=sample_rate,
        num_channels=num_channels,
@ -363,14 +363,32 @@ def register_event_handlers(

        # Write buffers to temp files and enqueue combined processing task
        audio_temp_path = None
+        user_audio_temp_path = None
+        bot_audio_temp_path = None
        transcript_temp_path = None

        try:
-            if not in_memory_audio_buffer.is_empty:
-                audio_temp_path = await in_memory_audio_buffer.write_to_temp_file()
+            if not in_memory_audio_buffers.mixed.is_empty:
+                audio_temp_path = (
+                    await in_memory_audio_buffers.mixed.write_to_temp_file()
+                )
            else:
                logger.debug("Audio buffer is empty, skipping upload")

+            if not in_memory_audio_buffers.user.is_empty:
+                user_audio_temp_path = (
+                    await in_memory_audio_buffers.user.write_to_temp_file()
+                )
+            else:
+                logger.debug("User audio buffer is empty, skipping upload")
+
+            if not in_memory_audio_buffers.bot.is_empty:
+                bot_audio_temp_path = (
+                    await in_memory_audio_buffers.bot.write_to_temp_file()
+                )
+            else:
+                logger.debug("Bot audio buffer is empty, skipping upload")
+
            transcript_temp_path = in_memory_logs_buffer.write_transcript_to_temp_file()
            if not transcript_temp_path:
                logger.debug("No transcript events in logs buffer, skipping upload")
@ -385,16 +403,18 @@ def register_event_handlers(
            workflow_run_id,
            audio_temp_path,
            transcript_temp_path,
+            user_audio_temp_path,
+            bot_audio_temp_path,
        )

    # Return the buffer so it can be passed to other handlers
-    return in_memory_audio_buffer
+    return in_memory_audio_buffers


 def register_audio_data_handler(
    audio_buffer: AudioBufferProcessor,
    workflow_run_id,
-    in_memory_buffer: InMemoryAudioBuffer,
+    in_memory_buffers: InMemoryRecordingBuffers,
 ):
    """Register event handler for audio data"""
    logger.info(f"Registering audio data handler for workflow run {workflow_run_id}")
@ -404,9 +424,19 @@ def register_audio_data_handler(
        if not audio:
            return

-        # Use in-memory buffer
        try:
-            await in_memory_buffer.append(audio)
+            await in_memory_buffers.mixed.append(audio)
        except MemoryError as e:
-            logger.error(f"Memory buffer full: {e}")
-            # Could implement overflow to disk here if needed
+            logger.error(f"Mixed audio buffer full: {e}")
+
+    @audio_buffer.event_handler("on_track_audio_data")
+    async def on_track_audio_data(
+        buffer, user_audio, bot_audio, sample_rate, num_channels
+    ):
+        try:
+            if user_audio:
+                await in_memory_buffers.user.append(user_audio)
+            if bot_audio:
+                await in_memory_buffers.bot.append(bot_audio)
+        except MemoryError as e:
+            logger.error(f"Track audio buffer full: {e}")
--- a/api/services/pipecat/in_memory_buffers.py
+++ b/api/services/pipecat/in_memory_buffers.py
@ -75,6 +75,27 @@ class InMemoryAudioBuffer:
        return self._total_size


+class InMemoryRecordingBuffers:
+    """Holds the mixed recording plus aligned user and bot mono tracks."""
+
+    def __init__(self, workflow_run_id: int, sample_rate: int, num_channels: int = 1):
+        self.mixed = InMemoryAudioBuffer(
+            workflow_run_id=workflow_run_id,
+            sample_rate=sample_rate,
+            num_channels=num_channels,
+        )
+        self.user = InMemoryAudioBuffer(
+            workflow_run_id=workflow_run_id,
+            sample_rate=sample_rate,
+            num_channels=1,
+        )
+        self.bot = InMemoryAudioBuffer(
+            workflow_run_id=workflow_run_id,
+            sample_rate=sample_rate,
+            num_channels=1,
+        )
+
+
 class InMemoryLogsBuffer:
    """Buffer real-time feedback events in memory during a call, then save to workflow run logs."""

--- a/api/services/workflow/dto.py
+++ b/api/services/workflow/dto.py
@ -718,6 +718,8 @@ class TriggerNodeData(BaseNodeData):
                "rsvp": "{{gathered_context.rsvp}}",
                "duration": "{{cost_info.call_duration_seconds}}",
                "recording_url": "{{recording_url}}",
+                "user_recording_url": "{{user_recording_url}}",
+                "bot_recording_url": "{{bot_recording_url}}",
                "transcript_url": "{{transcript_url}}",
            },
        },