fix: migrate from custom audio recorder to native AudioBuffer (#115)

* fix: update to pipecat VM Detector
* fix: refactor to remove audio synchronizer
* feat: add speechmatics as STT
2026-06-22 08:38:13 +02:00 · 2026-01-08 18:03:26 +05:30 · 2026-01-08 18:03:26 +05:30 · edf0fa4fbc
commit edf0fa4fbc
parent 31521008cf
12 changed files with 193 additions and 591 deletions
--- a/api/services/looptalk/core/pipeline_builder.py
+++ b/api/services/looptalk/core/pipeline_builder.py
@@ -83,29 +83,31 @@ class LoopTalkPipelineBuilder:

        logger.debug(f"Created services for {role}: STT={stt}, LLM={llm}, TTS={tts}")

-        audio_buffer, audio_synchronizer, transcript, context = (
-            create_pipeline_components(audio_config)
-        )
-
-        context_aggregator = LLMContextAggregatorPair(context)
-
        # Get workflow graph
        workflow_graph = WorkflowGraph(
            ReactFlowDTO.model_validate(workflow.workflow_definition_with_fallback)
        )

-        # Create engine
+        # Create engine first (needed for create_pipeline_components)
        engine = PipecatEngine(
-            task=None,  # Will be set after creating the task
            llm=llm,
-            context=context,
            tts=tts,
            workflow=workflow_graph,
            call_context_vars={},
-            audio_buffer=audio_buffer,
            workflow_run_id=None,  # LoopTalk doesn't have workflow runs
        )

+        # Create pipeline components with audio configuration and engine
+        audio_buffer, transcript, context = create_pipeline_components(
+            audio_config, engine
+        )
+
+        # Set the context and audio_buffer after creation
+        engine.set_context(context)
+        engine.set_audio_buffer(audio_buffer)
+
+        context_aggregator = LLMContextAggregatorPair(context)
+
        # Create STT mute filter
        stt_mute_filter = STTMuteFilter(
            config=STTMuteConfig(
@@ -124,19 +126,13 @@ class LoopTalkPipelineBuilder:
        user_context_aggregator = context_aggregator.user()
        assistant_context_aggregator = context_aggregator.assistant()

-        # Register processors with synchronizer for merged audio
-        audio_synchronizer.register_processors(
-            audio_buffer.input(), audio_buffer.output()
-        )
-
        # Get audio streamer for real-time streaming
        audio_streamer = get_or_create_audio_streamer(str(test_session_id), role)

-        # Create pipeline
+        # Create pipeline with AudioBufferProcessor after transport.output()
        pipeline = Pipeline(
            [
                transport.input(),
-                audio_buffer.input(),  # Record input audio
                audio_streamer,  # Stream audio to connected clients
                stt_mute_filter,
                stt,
@@ -146,7 +142,7 @@ class LoopTalkPipelineBuilder:
                pipeline_engine_callback_processor,
                tts,
                transport.output(),
-                audio_buffer.output(),  # Record output audio
+                audio_buffer,  # AudioBufferProcessor - records both input and output audio
                transcript.assistant(),
                assistant_context_aggregator,
            ]
@@ -157,13 +153,12 @@ class LoopTalkPipelineBuilder:
        task = create_pipeline_task(pipeline, conversation_id, audio_config)

        # Set the task on the engine
-        engine.task = task
+        engine.set_task(task)

        return {
            "task": task,
            "engine": engine,
            "audio_buffer": audio_buffer,
-            "audio_synchronizer": audio_synchronizer,
            "transcript": transcript,
            "assistant_context_aggregator": assistant_context_aggregator,
            "audio_streamer": audio_streamer,
--- a/api/services/looptalk/orchestrator.py
+++ b/api/services/looptalk/orchestrator.py
@@ -245,7 +245,6 @@ class LoopTalkTestOrchestrator:
        engine = pipeline_info["engine"]
        task = pipeline_info["task"]
        audio_buffer = pipeline_info["audio_buffer"]
-        audio_synchronizer = pipeline_info["audio_synchronizer"]
        transcript = pipeline_info["transcript"]
        assistant_context_aggregator = pipeline_info["assistant_context_aggregator"]

@@ -255,7 +254,6 @@ class LoopTalkTestOrchestrator:
            logger.debug(f"LoopTalk {role} client connected - initializing workflow")
            # Start audio recording
            await audio_buffer.start_recording()
-            await audio_synchronizer.start_recording()
            await engine.initialize()

        @transport.event_handler("on_client_disconnected")
@@ -263,7 +261,6 @@ class LoopTalkTestOrchestrator:
            logger.debug(f"LoopTalk {role} client disconnected")
            # Stop audio recording
            await audio_buffer.stop_recording()
-            await audio_synchronizer.stop_recording()

            # Handle disconnect propagation - stop the other agent too
            await self.session_manager.handle_agent_disconnect(
@@ -274,11 +271,11 @@ class LoopTalkTestOrchestrator:

        # Register custom audio and transcript handlers for LoopTalk
        await self._register_looptalk_handlers(
-            audio_synchronizer, transcript, test_session_id, role
+            audio_buffer, transcript, test_session_id, role
        )

    async def _register_looptalk_handlers(
-        self, audio_synchronizer, transcript, test_session_id: int, role: str
+        self, audio_buffer, transcript, test_session_id: int, role: str
    ):
        """Register LoopTalk-specific handlers for audio and transcript recording"""

@@ -288,9 +285,9 @@ class LoopTalkTestOrchestrator:
        audio_metadata = {"sample_rate": None, "num_channels": None}

        # Audio handler - writes directly to PCM file
-        @audio_synchronizer.event_handler("on_merged_audio")
-        async def on_merged_audio(_, pcm, sample_rate, num_channels):
-            if not pcm:
+        @audio_buffer.event_handler("on_audio_data")
+        async def on_audio_data(buffer, audio, sample_rate, num_channels):
+            if not audio:
                return

            # Store metadata on first write
@@ -301,7 +298,7 @@ class LoopTalkTestOrchestrator:
            # Append PCM data to temporary file
            try:
                with open(paths["temp_audio"], "ab") as f:
-                    f.write(pcm)
+                    f.write(audio)
            except Exception as e:
                logger.error(
                    f"Failed to write audio for {role} in session {test_session_id}: {e}"