chore: update prompt for pre-recorded audio generation

2026-07-25 12:01:04 +02:00 · 2026-04-08 22:23:14 +05:30 · 2026-04-08 22:23:14 +05:30 · 1f5229e2df
commit 1f5229e2df
parent d8ac9a80b2
3 changed files with 259 additions and 11 deletions
--- a/api/services/pipecat/recording_router_processor.py
+++ b/api/services/pipecat/recording_router_processor.py
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode: Optional[str] = None  # None = detecting, "tts", "recording"
        self._recording_id_buffer = ""
        self._recording_playback_started = False
+        self._second_marker_seen = False

    # ------------------------------------------------------------------
    # Frame dispatch
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
            await self.push_frame(frame, direction)
            return

+        # --- Second marker already seen — drop everything ---
+        if self._second_marker_seen:
+            return
+
        # --- TTS mode established: pass text through normally ---
        if self._mode == "tts":
-            await self.push_frame(frame, direction)
+            if RECORDING_MARKER in frame.text:
+                before = frame.text[: frame.text.index(RECORDING_MARKER)]
+                if before:
+                    await self.push_frame(LLMTextFrame(before), direction)
+                self._second_marker_seen = True
+            else:
+                await self.push_frame(frame, direction)
            return

        # --- Recording mode: accumulate text and start playback ASAP ---
        if self._mode == "recording":
-            self._recording_id_buffer += frame.text
+            text = frame.text
+            if TTS_MARKER in text:
+                text = text[: text.index(TTS_MARKER)]
+                self._second_marker_seen = True
+            self._recording_id_buffer += text
            if not self._recording_playback_started:
                buf = self._recording_id_buffer.lstrip()
                if " " in buf:
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode = None
        self._recording_id_buffer = ""
        self._recording_playback_started = False
+        self._second_marker_seen = False
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -26,24 +26,24 @@ TTS_MARKER = "▸"  # Generate dynamic TTS text

 RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
 RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
-Every response you generate MUST begin with a response mode indicator.
+Every response you generate MUST begin with exactly one response mode indicator.
 You have two modes for responding:

 1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
-   Format: `▸` followed by a space and your full spoken response.
+   Format: ▸ followed by a space and your full spoken response. Nothing else.
   Example: ▸ Hello! How can I help you today?

 2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
-   Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
+   Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
   Example: ● rec_greeting_01 [ Provided Transcript ]

 RULES:
- Your response MUST start with either `▸` or `●` as the very first character.
- For `▸` (dynamic speech): Follow with a space and your full response text.
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
- Use `●` when a pre-recorded message matches the situation well.
- Use `▸` when you need to generate a dynamic, contextual response.
- NEVER mix modes in a single response. Choose one."""
+- Your response MUST start with either ▸ or ● as the very first character.
+- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Don't mix with ●.
+- For ● (pre-recorded audio): Follow with a space and recording_id of the audio clip with its transcript. Don't mix with ▸.
+- Use ● when a pre-recorded message matches the situation well.
+- Use ▸ when you need to generate a dynamic, contextual response.
+- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""


 def compose_system_prompt_for_node(
--- a/api/tests/test_recording_router_processor.py
+++ b/api/tests/test_recording_router_processor.py
@ -0,0 +1,232 @@
+"""Tests for RecordingRouterProcessor mixed-marker handling.
+
+When the LLM generates a response containing both a TTS marker (▸) and a
+recording marker (●), only the *first* marker should be honoured. Everything
+from the second marker onward must be silently dropped so it never reaches
+downstream TTS or triggers a second recording playback.
+
+Uses pipecat's ``run_test`` helper to send frames through a real pipeline
+and inspect what arrives downstream.
+"""
+
+from typing import Optional
+
+import pytest
+
+from api.services.pipecat.recording_router_processor import (
+    RecordingRouterProcessor,
+)
+from api.services.workflow.pipecat_engine_context_composer import (
+    RECORDING_MARKER,
+    TTS_MARKER,
+)
+from pipecat.frames.frames import (
+    LLMFullResponseEndFrame,
+    LLMTextFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+    TTSTextFrame,
+)
+from pipecat.tests import run_test
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+FAKE_AUDIO = b"\x00\x01" * 8000  # 0.5 s of 16-bit mono @ 16 kHz (8000 samples)
+
+
+async def _fake_fetch(recording_id: str) -> Optional[bytes]:
+    """Stub that returns fake PCM audio for any recording_id."""
+    return FAKE_AUDIO
+
+
+def _make_processor(**kwargs) -> RecordingRouterProcessor:
+    return RecordingRouterProcessor(
+        audio_sample_rate=16_000,
+        fetch_recording_audio=kwargs.pop("fetch", _fake_fetch),
+        **kwargs,
+    )
+
+
+def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
+    """Build a list of LLMTextFrame from raw strings."""
+    return [LLMTextFrame(text=t) for t in tokens]
+
+
+# ---------------------------------------------------------------------------
+# Tests — single marker (baseline sanity)
+# ---------------------------------------------------------------------------
+
+
+class TestSingleMarker:
+    """Verify basic TTS-only and recording-only paths still work."""
+
+    @pytest.mark.asyncio
+    async def test_tts_only(self):
+        """▸ Hello — text should flow downstream as LLMTextFrames."""
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                TTS_MARKER,
+                " Hello, how are you today?",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,  # don't assert types, inspect manually
+        )
+
+        tts_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+        assert "Hello, how are you today?" in tts_text
+
+        # No audio playback
+        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
+
+    @pytest.mark.asyncio
+    async def test_recording_only(self):
+        """● rec_id [transcript] — should play audio and push TTSTextFrame
+        context."""
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                RECORDING_MARKER,
+                " abc123",
+                " [ This is the transcript. ]",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Audio playback frames should be present
+        assert any(isinstance(f, TTSStartedFrame) for f in down)
+        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
+        assert any(isinstance(f, TTSStoppedFrame) for f in down)
+
+        # Context TTSTextFrame with transcript
+        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
+        assert len(ctx_frames) == 1
+        assert "abc123" in ctx_frames[0].text
+
+
+# ---------------------------------------------------------------------------
+# Tests — mixed markers (the bug)
+# ---------------------------------------------------------------------------
+
+
+class TestMixedMarkerSuppression:
+    """The LLM sometimes generates both markers in one response.
+
+    Only the first marker should be honoured; the second marker and
+    everything after it must be dropped.
+    """
+
+    @pytest.mark.asyncio
+    async def test_tts_then_recording_marker_ignores_recording(self):
+        """▸ text... ● rec_id [transcript]
+
+        Expected: only the TTS text reaches downstream; the recording
+        marker, recording_id, and bracketed transcript are all suppressed.
+        No audio playback frames should appear.
+        """
+        processor = _make_processor()
+
+        frames_to_send = _llm_tokens(
+            [
+                TTS_MARKER,
+                " Okay, so this is regarding government changes.",
+                "\n",
+                RECORDING_MARKER,
+                " fetafnqb",
+                " [ Okay, so it's Nancy here. ]",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Collect all LLMTextFrame text that was NOT marked skip_tts
+        tts_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+
+        # The TTS text should contain the first sentence
+        assert "government changes" in tts_text
+
+        # Nothing from the recording section should leak into TTS
+        assert RECORDING_MARKER not in tts_text
+        assert "fetafnqb" not in tts_text
+        assert "Nancy" not in tts_text
+
+        # No audio playback frames
+        assert not any(isinstance(f, TTSStartedFrame) for f in down)
+        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
+        assert not any(isinstance(f, TTSStoppedFrame) for f in down)
+
+    @pytest.mark.asyncio
+    async def test_recording_then_tts_marker_ignores_tts(self):
+        """● rec_id [transcript] ▸ text...
+
+        Expected: recording plays; the TTS marker and following text are
+        suppressed — they must not appear in any downstream frame, including
+        the TTSTextFrame context pushed at response end.
+        """
+        fetched_ids: list[str] = []
+
+        async def tracking_fetch(recording_id: str):
+            fetched_ids.append(recording_id)
+            return FAKE_AUDIO
+
+        processor = _make_processor(fetch=tracking_fetch)
+
+        frames_to_send = _llm_tokens(
+            [
+                RECORDING_MARKER,
+                " fetafnqb",
+                " [ Okay, so it's Nancy here. ]",
+                "\n",
+                TTS_MARKER,
+                " And this is the fallback TTS text.",
+            ]
+        ) + [LLMFullResponseEndFrame()]
+
+        down, _ = await run_test(
+            processor,
+            frames_to_send=frames_to_send,
+            expected_down_frames=None,
+        )
+
+        # Recording playback should have occurred
+        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
+
+        # Only the correct recording_id should have been fetched
+        assert fetched_ids == ["fetafnqb"]
+
+        # The TTS text after the ▸ marker must NOT reach TTS as an LLMTextFrame
+        all_text = "".join(
+            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
+        )
+        assert "fallback TTS text" not in all_text
+
+        # The TTSTextFrame context pushed at response end should only contain
+        # the recording marker + recording_id + transcript, not the TTS part
+        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
+        assert len(ctx_frames) == 1
+        ctx_text = ctx_frames[0].text
+        assert "fetafnqb" in ctx_text
+        assert TTS_MARKER not in ctx_text
+        assert "fallback TTS text" not in ctx_text