chore: update prompt for pre-recorded audio generation

2026-07-25 12:01:04 +02:00 · 2026-04-08 22:23:14 +05:30 · 2026-04-08 22:23:14 +05:30 · 1f5229e2df
commit 1f5229e2df
parent d8ac9a80b2
3 changed files with 259 additions and 11 deletions
--- a/api/services/pipecat/recording_router_processor.py
+++ b/api/services/pipecat/recording_router_processor.py
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode: Optional[str] = None  # None = detecting, "tts", "recording"
        self._recording_id_buffer = ""
        self._recording_playback_started = False
+        self._second_marker_seen = False

    # ------------------------------------------------------------------
    # Frame dispatch
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
            await self.push_frame(frame, direction)
            return

+        # --- Second marker already seen — drop everything ---
+        if self._second_marker_seen:
+            return
+
        # --- TTS mode established: pass text through normally ---
        if self._mode == "tts":
-            await self.push_frame(frame, direction)
+            if RECORDING_MARKER in frame.text:
+                before = frame.text[: frame.text.index(RECORDING_MARKER)]
+                if before:
+                    await self.push_frame(LLMTextFrame(before), direction)
+                self._second_marker_seen = True
+            else:
+                await self.push_frame(frame, direction)
            return

        # --- Recording mode: accumulate text and start playback ASAP ---
        if self._mode == "recording":
-            self._recording_id_buffer += frame.text
+            text = frame.text
+            if TTS_MARKER in text:
+                text = text[: text.index(TTS_MARKER)]
+                self._second_marker_seen = True
+            self._recording_id_buffer += text
            if not self._recording_playback_started:
                buf = self._recording_id_buffer.lstrip()
                if " " in buf:
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode = None
        self._recording_id_buffer = ""
        self._recording_playback_started = False
+        self._second_marker_seen = False
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -26,24 +26,24 @@ TTS_MARKER = "▸"  # Generate dynamic TTS text

 RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
 RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
-Every response you generate MUST begin with a response mode indicator.
+Every response you generate MUST begin with exactly one response mode indicator.
 You have two modes for responding:

 1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
-   Format: `▸` followed by a space and your full spoken response.
+   Format: ▸ followed by a space and your full spoken response. Nothing else.
   Example: ▸ Hello! How can I help you today?

 2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
-   Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
+   Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
   Example: ● rec_greeting_01 [ Provided Transcript ]

 RULES:
- Your response MUST start with either `▸` or `●` as the very first character.
- For `▸` (dynamic speech): Follow with a space and your full response text.
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
- Use `●` when a pre-recorded message matches the situation well.
- Use `▸` when you need to generate a dynamic, contextual response.
- NEVER mix modes in a single response. Choose one."""
+- Your response MUST start with either ▸ or ● as the very first character.
+- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Don't mix with ●.
+- For ● (pre-recorded audio): Follow with a space and recording_id of the audio clip with its transcript. Don't mix with ▸.
+- Use ● when a pre-recorded message matches the situation well.
+- Use ▸ when you need to generate a dynamic, contextual response.
+- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""


 def compose_system_prompt_for_node(