From 1f5229e2df35139dd8d990abd6d2bb24ac48b568 Mon Sep 17 00:00:00 2001
From: Abhishek Kumar <abhishek@a6k.me>
Date: Wed, 8 Apr 2026 22:23:14 +0530
Subject: [PATCH] chore: update prompt for pre-recorded audio generation

---
 .../pipecat/recording_router_processor.py     |  20 +-
 .../pipecat_engine_context_composer.py        |  18 +-
 api/tests/test_recording_router_processor.py  | 232 ++++++++++++++++++
 3 files changed, 259 insertions(+), 11 deletions(-)
 create mode 100644 api/tests/test_recording_router_processor.py

diff --git a/api/services/pipecat/recording_router_processor.py b/api/services/pipecat/recording_router_processor.py
index 6c22d35..d1291a9 100644
--- a/api/services/pipecat/recording_router_processor.py
+++ b/api/services/pipecat/recording_router_processor.py
@@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
         self._mode: Optional[str] = None  # None = detecting, "tts", "recording"
         self._recording_id_buffer = ""
         self._recording_playback_started = False
+        self._second_marker_seen = False
 
     # ------------------------------------------------------------------
     # Frame dispatch
@@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
             await self.push_frame(frame, direction)
             return
 
+        # --- Second marker already seen — drop everything ---
+        if self._second_marker_seen:
+            return
+
         # --- TTS mode established: pass text through normally ---
         if self._mode == "tts":
-            await self.push_frame(frame, direction)
+            if RECORDING_MARKER in frame.text:
+                before = frame.text[: frame.text.index(RECORDING_MARKER)]
+                if before:
+                    await self.push_frame(LLMTextFrame(before), direction)
+                self._second_marker_seen = True
+            else:
+                await self.push_frame(frame, direction)
             return
 
         # --- Recording mode: accumulate text and start playback ASAP ---
         if self._mode == "recording":
-            self._recording_id_buffer += frame.text
+            text = frame.text
+            if TTS_MARKER in text:
+                text = text[: text.index(TTS_MARKER)]
+                self._second_marker_seen = True
+            self._recording_id_buffer += text
             if not self._recording_playback_started:
                 buf = self._recording_id_buffer.lstrip()
                 if " " in buf:
@@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
         self._mode = None
         self._recording_id_buffer = ""
         self._recording_playback_started = False
+        self._second_marker_seen = False
diff --git a/api/services/workflow/pipecat_engine_context_composer.py b/api/services/workflow/pipecat_engine_context_composer.py
index 0f85c64..ffdfd77 100644
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@@ -26,24 +26,24 @@ TTS_MARKER = "▸"  # Generate dynamic TTS text
 
 RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
 RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
-Every response you generate MUST begin with a response mode indicator.
+Every response you generate MUST begin with exactly one response mode indicator.
 You have two modes for responding:
 
 1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
-   Format: `▸` followed by a space and your full spoken response.
+   Format: ▸ followed by a space and your full spoken response. Nothing else.
    Example: ▸ Hello! How can I help you today?
 
 2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
-   Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
+   Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
    Example: ● rec_greeting_01 [ Provided Transcript ]
 
 RULES:
-- Your response MUST start with either `▸` or `●` as the very first character.
-- For `▸` (dynamic speech): Follow with a space and your full response text.
-- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
-- Use `●` when a pre-recorded message matches the situation well.
-- Use `▸` when you need to generate a dynamic, contextual response.
-- NEVER mix modes in a single response. Choose one."""
+- Your response MUST start with either ▸ or ● as the very first character.
+- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Don't mix with ●
+- For ● (pre-recorded audio): Follow with a space and the recording_id of the audio clip with its transcript. Don't mix with ▸
+- Use ● when a pre-recorded message matches the situation well.
+- Use ▸ when you need to generate a dynamic, contextual response.
+- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
 
 
 def compose_system_prompt_for_node(
diff --git a/api/tests/test_recording_router_processor.py b/api/tests/test_recording_router_processor.py
new file mode 100644
index 0000000..24b76c2
--- /dev/null
+++ b/api/tests/test_recording_router_processor.py
@@ -0,0 +1,232 @@
+"""Tests for RecordingRouterProcessor mixed-marker handling.
+
+When the LLM generates a response containing both a TTS marker (▸) and a
+recording marker (●), only the *first* marker should be honoured. Everything
+from the second marker onward must be silently dropped so it never reaches
+downstream TTS or triggers a second recording playback.
+
+Uses pipecat's ``run_test`` helper to send frames through a real pipeline
+and inspect what arrives downstream.
+"""
+
+from typing import Optional
+
+import pytest
+
+from api.services.pipecat.recording_router_processor import (
+    RecordingRouterProcessor,
+)
+from api.services.workflow.pipecat_engine_context_composer import (
+    RECORDING_MARKER,
+    TTS_MARKER,
+)
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMTextFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+    TTSTextFrame,
+)
+from pipecat.tests import run_test
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+FAKE_AUDIO = b"\x00\x01" * 8000  # 8000 16-bit mono samples = 0.5 s @ 16 kHz
+
+
+async def _fake_fetch(recording_id: str) -> Optional[bytes]:
+    """Stub that returns fake PCM audio for any recording_id."""
+    return FAKE_AUDIO
+
+
+def _make_processor(**kwargs) -> RecordingRouterProcessor:
+    return RecordingRouterProcessor(
+        audio_sample_rate=16_000,
+        fetch_recording_audio=kwargs.pop("fetch", _fake_fetch),
+        **kwargs,
+    )
+
+
+def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
+    """Build a list of LLMTextFrame from raw strings."""
+    return [LLMTextFrame(text=t) for t in tokens]
+
+
+# ---------------------------------------------------------------------------
+# Tests — single marker (baseline sanity)
+# ---------------------------------------------------------------------------
+
+
+class TestSingleMarker:
+    """Verify basic TTS-only and recording-only paths still work."""
+
+    @pytest.mark.asyncio
+    async def test_tts_only(self):
+        """▸ Hello — text should flow downstream as LLMTextFrames."""
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                TTS_MARKER,
+                " Hello, how are you today?",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,  # don't assert types, inspect manually
+        )
+
+        tts_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+        assert "Hello, how are you today?" in tts_text
+
+        # No audio playback
+        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
+
+    @pytest.mark.asyncio
+    async def test_recording_only(self):
+        """● rec_id [transcript] — should play audio and push TTSTextFrame
+        context."""
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                RECORDING_MARKER,
+                " abc123",
+                " [ This is the transcript. ]",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Audio playback frames should be present
+        assert any(isinstance(f, TTSStartedFrame) for f in down)
+        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
+        assert any(isinstance(f, TTSStoppedFrame) for f in down)
+
+        # Context TTSTextFrame with transcript
+        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
+        assert len(ctx_frames) == 1
+        assert "abc123" in ctx_frames[0].text
+
+
+# ---------------------------------------------------------------------------
+# Tests — mixed markers (the bug)
+# ---------------------------------------------------------------------------
+
+
+class TestMixedMarkerSuppression:
+    """The LLM sometimes generates both markers in one response.
+
+    Only the first marker should be honoured; the second marker and
+    everything after it must be dropped.
+    """
+
+    @pytest.mark.asyncio
+    async def test_tts_then_recording_marker_ignores_recording(self):
+        """▸ text... ● rec_id [transcript]
+
+        Expected: only the TTS text reaches downstream; the recording
+        marker, recording_id, and bracketed transcript are all suppressed.
+        No audio playback frames should appear.
+        """
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                TTS_MARKER,
+                " Okay, so this is regarding government changes.",
+                "\n",
+                RECORDING_MARKER,
+                " fetafnqb",
+                " [ Okay, so it's Nancy here. ]",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Collect all LLMTextFrame text that was NOT marked skip_tts
+        tts_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+
+        # The TTS text should contain the first sentence
+        assert "government changes" in tts_text
+
+        # Nothing from the recording section should leak into TTS
+        assert RECORDING_MARKER not in tts_text
+        assert "fetafnqb" not in tts_text
+        assert "Nancy" not in tts_text
+
+        # No audio playback frames
+        assert not any(isinstance(f, TTSStartedFrame) for f in down)
+        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
+        assert not any(isinstance(f, TTSStoppedFrame) for f in down)
+
+    @pytest.mark.asyncio
+    async def test_recording_then_tts_marker_ignores_tts(self):
+        """● rec_id [transcript] ▸ text...
+
+        Expected: recording plays; the TTS marker and following text are
+        suppressed — they must not appear in any downstream frame, including
+        the TTSTextFrame context pushed at response end.
+        """
+        fetched_ids: list[str] = []
+
+        async def tracking_fetch(recording_id: str):
+            fetched_ids.append(recording_id)
+            return FAKE_AUDIO
+
+        processor = _make_processor(fetch=tracking_fetch)
+
+        frames_to_send = _llm_tokens(
+            [
+                RECORDING_MARKER,
+                " fetafnqb",
+                " [ Okay, so it's Nancy here. ]",
+                "\n",
+                TTS_MARKER,
+                " And this is the fallback TTS text.",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Recording playback should have occurred
+        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
+
+        # Only the correct recording_id should have been fetched
+        assert fetched_ids == ["fetafnqb"]
+
+        # The TTS text after the ▸ marker must NOT reach TTS via any LLMTextFrame
+        all_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+        assert "fallback TTS text" not in all_text
+
+        # The TTSTextFrame context pushed at response end should only contain
+        # the recording marker + recording_id + transcript, not the TTS part
+        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
+        assert len(ctx_frames) == 1
+        ctx_text = ctx_frames[0].text
+        assert "fetafnqb" in ctx_text
+        assert TTS_MARKER not in ctx_text
+        assert "fallback TTS text" not in ctx_text