chore: update prompt for pre-recorded audio generation

2026-07-04 10:52:17 +02:00 · 2026-04-08 22:23:14 +05:30 · 2026-04-08 22:23:14 +05:30 · 1f5229e2df
commit 1f5229e2df
parent d8ac9a80b2
3 changed files with 259 additions and 11 deletions
--- a/api/services/pipecat/recording_router_processor.py
+++ b/api/services/pipecat/recording_router_processor.py
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode: Optional[str] = None  # None = detecting, "tts", "recording"
        self._recording_id_buffer = ""
        self._recording_playback_started = False
        self._second_marker_seen = False
    # ------------------------------------------------------------------
    # Frame dispatch
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
            await self.push_frame(frame, direction)
            return
        # --- Second marker already seen — drop everything ---
        if self._second_marker_seen:
            return
        # --- TTS mode established: pass text through normally ---
        if self._mode == "tts":
-            await self.push_frame(frame, direction)
+            if RECORDING_MARKER in frame.text:
                before = frame.text[: frame.text.index(RECORDING_MARKER)]
                if before:
                    await self.push_frame(LLMTextFrame(before), direction)
                self._second_marker_seen = True
            else:
                await self.push_frame(frame, direction)
            return
        # --- Recording mode: accumulate text and start playback ASAP ---
        if self._mode == "recording":
-            self._recording_id_buffer += frame.text
+            text = frame.text
            if TTS_MARKER in text:
                text = text[: text.index(TTS_MARKER)]
                self._second_marker_seen = True
            self._recording_id_buffer += text
            if not self._recording_playback_started:
                buf = self._recording_id_buffer.lstrip()
                if " " in buf:
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
        self._mode = None
        self._recording_id_buffer = ""
        self._recording_playback_started = False
        self._second_marker_seen = False
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -26,24 +26,24 @@ TTS_MARKER = "▸"  # Generate dynamic TTS text
 RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
 RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
-Every response you generate MUST begin with a response mode indicator.
+Every response you generate MUST begin with exactly one response mode indicator.
 You have two modes for responding:
 1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
-   Format: `▸` followed by a space and your full spoken response.
+   Format: ▸ followed by a space and your full spoken response. Nothing else.
   Example: ▸ Hello! How can I help you today?
 2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
-   Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
+   Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
   Example: ● rec_greeting_01 [ Provided Transcript ]
 RULES:
- Your response MUST start with either `▸` or `●` as the very first character.
+- Your response MUST start with either ▸ or ● as the very first character.
- For `▸` (dynamic speech): Follow with a space and your full response text.
+- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Don't mix with ●
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
+- For ● (pre-recorded audio): Follow with a space and the recording_id of the audio clip with its transcript. Don't mix with ▸
- Use `●` when a pre-recorded message matches the situation well.
+- Use ● when a pre-recorded message matches the situation well.
- Use `▸` when you need to generate a dynamic, contextual response.
+- Use ▸ when you need to generate a dynamic, contextual response.
- NEVER mix modes in a single response. Choose one."""
+- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
 def compose_system_prompt_for_node(
--- a/api/tests/test_recording_router_processor.py
+++ b/api/tests/test_recording_router_processor.py
@ -0,0 +1,232 @@
 """Tests for RecordingRouterProcessor mixed-marker handling.
 When the LLM generates a response containing both a TTS marker (▸) and a
 recording marker (●), only the *first* marker should be honoured. Everything
 from the second marker onward must be silently dropped so it never reaches
 downstream TTS or triggers a second recording playback.
 Uses pipecat's ``run_test`` helper to send frames through a real pipeline
 and inspect what arrives downstream.
 """
 from typing import Optional
 import pytest
 from api.services.pipecat.recording_router_processor import (
    RecordingRouterProcessor,
 )
 from api.services.workflow.pipecat_engine_context_composer import (
    RECORDING_MARKER,
    TTS_MARKER,
 )
 from pipecat.frames.frames import (
    LLMFullResponseEndFrame,
    LLMTextFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
    TTSTextFrame,
 )
 from pipecat.tests import run_test
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 FAKE_AUDIO = b"\x00\x01" * 8000  # 16000 bytes = 8000 16-bit mono samples, i.e. 0.5 s @ 16 kHz
 async def _fake_fetch(recording_id: str) -> Optional[bytes]:
    """Stub that returns fake PCM audio for any recording_id.

    The ``recording_id`` argument is ignored: the same ``FAKE_AUDIO`` buffer
    is returned on every call, so this stub never returns ``None``.
    """
    return FAKE_AUDIO
 def _make_processor(**kwargs) -> RecordingRouterProcessor:
    """Build a RecordingRouterProcessor configured for these tests.

    An optional ``fetch`` keyword overrides the audio fetcher (defaults to
    ``_fake_fetch``); every other keyword is forwarded to the constructor
    unchanged. The sample rate is fixed at 16 kHz.
    """
    # Remove "fetch" first so it is not forwarded a second time via **kwargs.
    fetch_callback = kwargs.pop("fetch", _fake_fetch)
    return RecordingRouterProcessor(
        audio_sample_rate=16_000,
        fetch_recording_audio=fetch_callback,
        **kwargs,
    )
 def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
    """Build a list of LLMTextFrame from raw strings.

    Each string in ``tokens`` becomes its own frame, in the same order, so a
    test can spell out how a response is split across streamed chunks.
    """
    return [LLMTextFrame(text=t) for t in tokens]
 # ---------------------------------------------------------------------------
 # Tests — single marker (baseline sanity)
 # ---------------------------------------------------------------------------
 class TestSingleMarker:
    """Verify basic TTS-only and recording-only paths still work."""
    @pytest.mark.asyncio
    async def test_tts_only(self):
        """▸ Hello — text should flow downstream as LLMTextFrames."""
        processor = _make_processor()
        # The marker and the spoken text arrive as separate frames, followed
        # by the end-of-response frame.
        frames_to_send = _llm_tokens(
            [
                TTS_MARKER,
                " Hello, how are you today?",
            ]
        ) + [LLMFullResponseEndFrame()]
        down, _ = await run_test(
            processor,
            frames_to_send=frames_to_send,
            expected_down_frames=None,  # don't assert types, inspect manually
        )
        # Only LLMTextFrames that are NOT flagged skip_tts count as speakable text.
        tts_text = "".join(
            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
        )
        assert "Hello, how are you today?" in tts_text
        # No audio playback
        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
    @pytest.mark.asyncio
    async def test_recording_only(self):
        """● rec_id [transcript] — should play audio and push TTSTextFrame
        context."""
        processor = _make_processor()
        # Marker, recording id and bracketed transcript arrive as separate frames.
        frames_to_send = _llm_tokens(
            [
                RECORDING_MARKER,
                " abc123",
                " [ This is the transcript. ]",
            ]
        ) + [LLMFullResponseEndFrame()]
        down, _ = await run_test(
            processor,
            frames_to_send=frames_to_send,
            expected_down_frames=None,
        )
        # Audio playback frames should be present
        assert any(isinstance(f, TTSStartedFrame) for f in down)
        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
        assert any(isinstance(f, TTSStoppedFrame) for f in down)
        # Exactly one context TTSTextFrame, and it carries the recording_id
        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
        assert len(ctx_frames) == 1
        assert "abc123" in ctx_frames[0].text
 # ---------------------------------------------------------------------------
 # Tests — mixed markers (the bug)
 # ---------------------------------------------------------------------------
 class TestMixedMarkerSuppression:
    """The LLM sometimes generates both markers in one response.
    Only the first marker should be honoured; the second marker and
    everything after it must be dropped.
    """
    @pytest.mark.asyncio
    async def test_tts_then_recording_marker_ignores_recording(self):
        """▸ text... ● rec_id [transcript]
        Expected: only the TTS text reaches downstream; the recording
        marker, recording_id, and bracketed transcript are all suppressed.
        No audio playback frames should appear.
        """
        processor = _make_processor()
        # The recording marker arrives as its own frame, after the TTS text.
        frames_to_send = _llm_tokens(
            [
                TTS_MARKER,
                " Okay, so this is regarding government changes.",
                "\n",
                RECORDING_MARKER,
                " fetafnqb",
                " [ Okay, so it's Nancy here. ]",
            ]
        ) + [LLMFullResponseEndFrame()]
        down, _ = await run_test(
            processor,
            frames_to_send=frames_to_send,
            expected_down_frames=None,
        )
        # Collect all LLMTextFrame text that was NOT marked skip_tts
        tts_text = "".join(
            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
        )
        # The TTS text should contain the first sentence
        assert "government changes" in tts_text
        # Nothing from the recording section should leak into TTS
        assert RECORDING_MARKER not in tts_text
        assert "fetafnqb" not in tts_text
        assert "Nancy" not in tts_text
        # No audio playback frames
        assert not any(isinstance(f, TTSStartedFrame) for f in down)
        assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
        assert not any(isinstance(f, TTSStoppedFrame) for f in down)
    @pytest.mark.asyncio
    async def test_recording_then_tts_marker_ignores_tts(self):
        """● rec_id [transcript] ▸ text...
        Expected: recording plays; the TTS marker and following text are
        suppressed — they must not appear in any downstream frame, including
        the TTSTextFrame context pushed at response end.
        """
        # Records every recording_id the processor asks for, in call order.
        fetched_ids: list[str] = []
        async def tracking_fetch(recording_id: str):
            fetched_ids.append(recording_id)
            return FAKE_AUDIO
        processor = _make_processor(fetch=tracking_fetch)
        frames_to_send = _llm_tokens(
            [
                RECORDING_MARKER,
                " fetafnqb",
                " [ Okay, so it's Nancy here. ]",
                "\n",
                TTS_MARKER,
                " And this is the fallback TTS text.",
            ]
        ) + [LLMFullResponseEndFrame()]
        down, _ = await run_test(
            processor,
            frames_to_send=frames_to_send,
            expected_down_frames=None,
        )
        # Recording playback should have occurred
        assert any(isinstance(f, TTSAudioRawFrame) for f in down)
        # Only the correct recording_id should have been fetched, exactly once
        assert fetched_ids == ["fetafnqb"]
        # The text after the ▸ marker must NOT reach TTS.
        # NOTE(review): only speakable (non-skip_tts) LLMTextFrames are
        # inspected here; the TTSTextFrame context is checked separately
        # below, and other frame types are not examined.
        all_text = "".join(
            f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
        )
        assert "fallback TTS text" not in all_text
        # The TTSTextFrame context pushed at response end should only contain
        # the recording marker + recording_id + transcript, not the TTS part
        ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
        assert len(ctx_frames) == 1
        ctx_text = ctx_frames[0].text
        assert "fetafnqb" in ctx_text
        assert TTS_MARKER not in ctx_text
        assert "fallback TTS text" not in ctx_text