feat: add recording audio option in tool and node transitions (#232)

* feat: allow uploading recording as part of node transition * feat: allow recordings in tool transitions * chore: fix tests
2026-06-22 08:38:13 +02:00 · 2026-04-10 17:53:42 +05:30 · 2026-04-10 17:53:42 +05:30 · 7c245051d2
commit 7c245051d2
parent 3f19a16e7f
54 changed files with 3575 additions and 640 deletions
--- a/api/services/workflow/pipecat_engine_custom_tools.py
+++ b/api/services/workflow/pipecat_engine_custom_tools.py
@ -16,6 +16,7 @@ from loguru import logger

 from api.db import db_client
 from api.enums import ToolCategory, WorkflowRunMode
+from api.services.pipecat.audio_playback import play_audio, play_audio_loop
 from api.services.telephony.call_transfer_manager import get_call_transfer_manager
 from api.services.telephony.factory import get_telephony_provider
 from api.services.telephony.transfer_event_protocol import TransferContext
@ -27,7 +28,6 @@ from api.services.workflow.tools.custom_tool import (
    execute_http_tool,
    tool_to_function_schema,
 )
-from api.utils.hold_audio import play_hold_audio_loop
 from pipecat.adapters.schemas.function_schema import FunctionSchema
 from pipecat.frames.frames import (
    FunctionCallResultProperties,
@ -77,6 +77,45 @@ class CustomToolManager:
        self._engine = engine
        self._organization_id: Optional[int] = None

+    async def _play_config_message(
+        self, config: dict, *, append_to_context: bool = False
+    ) -> bool:
+        """Play a message from tool config — text or pre-recorded audio.
+
+        Returns True if a message was queued, False otherwise.
+        """
+        message_type = config.get("messageType", "none")
+
+        if message_type == "audio":
+            recording_pk = config.get("audioRecordingId")
+            if recording_pk and self._engine._fetch_recording_audio:
+                result = await self._engine._fetch_recording_audio(
+                    recording_pk=int(recording_pk)
+                )
+                if result:
+                    await play_audio(
+                        result.audio,
+                        sample_rate=self._engine._audio_config.pipeline_sample_rate
+                        if self._engine._audio_config
+                        else 16000,
+                        queue_frame=self._engine._transport_output.queue_frame,
+                        transcript=result.transcript,
+                    )
+                    return True
+                else:
+                    logger.warning(f"Failed to fetch recording pk={recording_pk}")
+            return False
+
+        if message_type == "custom":
+            custom_message = config.get("customMessage", "")
+            if custom_message:
+                await self._engine.task.queue_frame(
+                    TTSSpeakFrame(custom_message, append_to_context=append_to_context)
+                )
+                return True
+
+        return False
+
    async def get_organization_id(self) -> Optional[int]:
        """Get and cache the organization ID from workflow run."""
        if self._organization_id is None:
@ -250,9 +289,30 @@ class CustomToolManager:

            try:
                # Queue custom message before executing the API call
+                # Queue custom message (text or audio) before executing the API call
                config = tool.definition.get("config", {}) if tool.definition else {}
+                custom_msg_type = config.get("customMessageType", "text")
                custom_message = config.get("customMessage", "")
-                if custom_message:
+                if custom_msg_type == "audio":
+                    recording_pk = config.get("customMessageRecordingId")
+                    if recording_pk and self._engine._fetch_recording_audio:
+                        logger.info(
+                            f"Playing audio message before HTTP tool: pk={recording_pk}"
+                        )
+                        self._engine._queued_speech_mute_state = "waiting"
+                        result = await self._engine._fetch_recording_audio(
+                            recording_pk=int(recording_pk)
+                        )
+                        if result:
+                            await play_audio(
+                                result.audio,
+                                sample_rate=self._engine._audio_config.pipeline_sample_rate
+                                if self._engine._audio_config
+                                else 16000,
+                                queue_frame=self._engine._transport_output.queue_frame,
+                                transcript=result.transcript,
+                            )
+                elif custom_message:
                    logger.info(
                        f"Playing custom message before HTTP tool: {custom_message}"
                    )
@ -299,8 +359,6 @@ class CustomToolManager:
            try:
                # Get the end call configuration
                config = tool.definition.get("config", {})
-                message_type = config.get("messageType", "none")
-                custom_message = config.get("customMessage", "")

                # Handle end call reason if enabled
                end_call_reason_enabled = config.get("endCallReason", False)
@ -322,10 +380,8 @@ class CustomToolManager:
                    properties=properties,
                )

-                if message_type == "custom" and custom_message:
-                    # Queue the custom message to be spoken
-                    logger.info(f"Playing custom goodbye message: {custom_message}")
-                    await self._engine.task.queue_frame(TTSSpeakFrame(custom_message))
+                played = await self._play_config_message(config)
+                if played:
                    # End the call after the message (not immediately)
                    await self._engine.end_call_with_reason(
                        EndTaskReason.END_CALL_TOOL_REASON.value,
@ -370,8 +426,6 @@ class CustomToolManager:
                # Get the transfer call configuration
                config = tool.definition.get("config", {})
                destination = config.get("destination", "")
-                message_type = config.get("messageType", "none")
-                custom_message = config.get("customMessage", "")
                timeout_seconds = config.get(
                    "timeout", 30
                )  # Default 30 seconds if not configured
@ -443,10 +497,9 @@ class CustomToolManager:
                        )
                        return

-                if message_type == "custom" and custom_message:
-                    logger.info(f"Playing pre-transfer message: {custom_message}")
+                played = await self._play_config_message(config)
+                if played:
                    self._engine._queued_speech_mute_state = "waiting"
-                    await self._engine.task.queue_frame(TTSSpeakFrame(custom_message))

                # Get organization ID for provider configuration
                organization_id = await self.get_organization_id()
@ -537,10 +590,10 @@ class CustomToolManager:

                    # Start hold music as background task
                    hold_music_task = asyncio.create_task(
-                        play_hold_audio_loop(
-                            self._engine.task,
-                            hold_music_stop_event,
-                            sample_rate,
+                        play_audio_loop(
+                            stop_event=hold_music_stop_event,
+                            sample_rate=sample_rate,
+                            queue_frame=self._engine._transport_output.queue_frame,
                        )
                    )