chore: update prompt for pre-recorded audio generation

This commit is contained in:
Abhishek Kumar 2026-04-08 22:23:14 +05:30
parent d8ac9a80b2
commit 1f5229e2df
3 changed files with 259 additions and 11 deletions

View file

@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
self._mode: Optional[str] = None # None = detecting, "tts", "recording"
self._recording_id_buffer = ""
self._recording_playback_started = False
self._second_marker_seen = False
# ------------------------------------------------------------------
# Frame dispatch
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
await self.push_frame(frame, direction)
return
# --- Second marker already seen — drop everything ---
if self._second_marker_seen:
return
# --- TTS mode established: pass text through normally ---
if self._mode == "tts":
await self.push_frame(frame, direction)
if RECORDING_MARKER in frame.text:
before = frame.text[: frame.text.index(RECORDING_MARKER)]
if before:
await self.push_frame(LLMTextFrame(before), direction)
self._second_marker_seen = True
else:
await self.push_frame(frame, direction)
return
# --- Recording mode: accumulate text and start playback ASAP ---
if self._mode == "recording":
self._recording_id_buffer += frame.text
text = frame.text
if TTS_MARKER in text:
text = text[: text.index(TTS_MARKER)]
self._second_marker_seen = True
self._recording_id_buffer += text
if not self._recording_playback_started:
buf = self._recording_id_buffer.lstrip()
if " " in buf:
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
self._mode = None
self._recording_id_buffer = ""
self._recording_playback_started = False
self._second_marker_seen = False

View file

@ -26,24 +26,24 @@ TTS_MARKER = "▸" # Generate dynamic TTS text
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
Every response you generate MUST begin with a response mode indicator.
Every response you generate MUST begin with exactly one response mode indicator.
You have two modes for responding:
1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
Format: `▸` followed by a space and your full spoken response.
Format: ▸ followed by a space and your full spoken response. Nothing else.
Example: ▸ Hello! How can I help you today?
2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
Example: ● rec_greeting_01 [ Provided Transcript ]
RULES:
- Your response MUST start with either `▸` or `●` as the very first character.
- For `▸` (dynamic speech): Follow with a space and your full response text.
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
- Use `●` when a pre-recorded message matches the situation well.
- Use `▸` when you need to generate a dynamic, contextual response.
- NEVER mix modes in a single response. Choose one."""
- Your response MUST start with either ▸ or ● as the very first character.
- For ▸ (dynamic speech): Follow with a space and your response to be generated using TTS engine. Don't mix with ●
- For ● (pre-recorded audio): Follow with a space and recording_id of the audio clip with its transcript. Don't mix with ▸
- Use ● when a pre-recorded message matches the situation well.
- Use ▸ when you need to generate a dynamic, contextual response.
- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
def compose_system_prompt_for_node(

View file

@ -0,0 +1,232 @@
"""Tests for RecordingRouterProcessor mixed-marker handling.
When the LLM generates a response containing both a TTS marker (▸) and a
recording marker (●), only the *first* marker should be honoured. Everything
from the second marker onward must be silently dropped so it never reaches
downstream TTS or triggers a second recording playback.
Uses pipecat's ``run_test`` helper to send frames through a real pipeline
and inspect what arrives downstream.
"""
from typing import Optional
import pytest
from api.services.pipecat.recording_router_processor import (
RecordingRouterProcessor,
)
from api.services.workflow.pipecat_engine_context_composer import (
RECORDING_MARKER,
TTS_MARKER,
)
from pipecat.frames.frames import (
LLMFullResponseEndFrame,
LLMTextFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
TTSTextFrame,
)
from pipecat.tests import run_test
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Canned audio payload returned by the fetch stub in these tests.
# 8000 repetitions of a 2-byte pattern = 16,000 bytes. Read as 16-bit mono
# PCM at the 16 kHz rate the test processor is built with, that is 8000
# samples, i.e. 0.5 second of audio (the previous "1 second" note was off by
# a factor of two). The value itself is intentionally left unchanged.
FAKE_AUDIO = b"\x00\x01" * 8000
async def _fake_fetch(recording_id: str) -> Optional[bytes]:
    """Fetch stub: ignore *recording_id* and always return ``FAKE_AUDIO``."""
    # Every id resolves to the same canned bytes; no lookup is performed.
    canned_audio = FAKE_AUDIO
    return canned_audio
def _make_processor(**kwargs) -> RecordingRouterProcessor:
    """Build a ``RecordingRouterProcessor`` configured for these tests.

    An optional ``fetch`` keyword overrides the audio fetcher (defaulting to
    ``_fake_fetch``); any remaining keywords are forwarded to the constructor
    unchanged. The sample rate is fixed at 16 kHz.
    """
    # Pull ``fetch`` out first so it is not forwarded a second time below.
    fetcher = kwargs.pop("fetch", _fake_fetch)
    processor = RecordingRouterProcessor(
        audio_sample_rate=16_000,
        fetch_recording_audio=fetcher,
        **kwargs,
    )
    return processor
def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
    """Wrap each raw string in its own ``LLMTextFrame``, preserving order."""
    frames = []
    for piece in tokens:
        frames.append(LLMTextFrame(text=piece))
    return frames
# ---------------------------------------------------------------------------
# Tests — single marker (baseline sanity)
# ---------------------------------------------------------------------------
class TestSingleMarker:
    """Baseline sanity: responses carrying a single marker keep working."""

    @pytest.mark.asyncio
    async def test_tts_only(self):
        """▸ Hello — text should flow downstream as LLMTextFrames."""
        router = _make_processor()
        outgoing = [
            *_llm_tokens(
                [
                    TTS_MARKER,
                    " Hello, how are you today?",
                ]
            ),
            LLMFullResponseEndFrame(),
        ]

        # expected_down_frames=None: frame types are not asserted by the
        # helper; the received frames are inspected by hand below.
        downstream, _ = await run_test(
            router,
            frames_to_send=outgoing,
            expected_down_frames=None,
        )

        # Gather the text of every LLMTextFrame not flagged skip_tts.
        spoken_parts = []
        for frame in downstream:
            if isinstance(frame, LLMTextFrame) and not frame.skip_tts:
                spoken_parts.append(frame.text)
        spoken = "".join(spoken_parts)

        assert "Hello, how are you today?" in spoken
        # A TTS-only response must not produce any raw audio frames.
        assert not any(isinstance(frame, TTSAudioRawFrame) for frame in downstream)

    @pytest.mark.asyncio
    async def test_recording_only(self):
        """● rec_id [transcript] — should play audio and push TTSTextFrame
        context."""
        router = _make_processor()
        outgoing = [
            *_llm_tokens(
                [
                    RECORDING_MARKER,
                    " abc123",
                    " [ This is the transcript. ]",
                ]
            ),
            LLMFullResponseEndFrame(),
        ]

        downstream, _ = await run_test(
            router,
            frames_to_send=outgoing,
            expected_down_frames=None,
        )

        # Each playback frame type (started / audio / stopped) must show up.
        assert any(isinstance(frame, TTSStartedFrame) for frame in downstream)
        assert any(isinstance(frame, TTSAudioRawFrame) for frame in downstream)
        assert any(isinstance(frame, TTSStoppedFrame) for frame in downstream)

        # Exactly one TTSTextFrame is expected, and it mentions the id.
        context_frames = [
            frame for frame in downstream if isinstance(frame, TTSTextFrame)
        ]
        assert len(context_frames) == 1
        assert "abc123" in context_frames[0].text
# ---------------------------------------------------------------------------
# Tests — mixed markers (the bug)
# ---------------------------------------------------------------------------
class TestMixedMarkerSuppression:
    """Responses that contain both markers.

    Whichever marker comes first decides the mode; the second marker and
    all text following it must be dropped.
    """

    @pytest.mark.asyncio
    async def test_tts_then_recording_marker_ignores_recording(self):
        """▸ text... ● rec_id [transcript]

        Expected: only the TTS text reaches downstream; the recording
        marker, recording_id, and bracketed transcript are all suppressed.
        No audio playback frames should appear.
        """
        router = _make_processor()
        outgoing = [
            *_llm_tokens(
                [
                    TTS_MARKER,
                    " Okay, so this is regarding government changes.",
                    "\n",
                    RECORDING_MARKER,
                    " fetafnqb",
                    " [ Okay, so it's Nancy here. ]",
                ]
            ),
            LLMFullResponseEndFrame(),
        ]

        downstream, _ = await run_test(
            router,
            frames_to_send=outgoing,
            expected_down_frames=None,
        )

        # Text of all LLMTextFrames that were not flagged skip_tts.
        spoken = "".join(
            frame.text
            for frame in downstream
            if isinstance(frame, LLMTextFrame) and not frame.skip_tts
        )

        # The first (TTS) sentence is kept ...
        assert "government changes" in spoken
        # ... while nothing from the recording part may leak into it.
        assert RECORDING_MARKER not in spoken
        assert "fetafnqb" not in spoken
        assert "Nancy" not in spoken

        # No playback frame of any kind may have been produced.
        playback_types = (TTSStartedFrame, TTSAudioRawFrame, TTSStoppedFrame)
        assert not any(isinstance(frame, playback_types) for frame in downstream)

    @pytest.mark.asyncio
    async def test_recording_then_tts_marker_ignores_tts(self):
        """● rec_id [transcript] ▸ text...

        Expected: recording plays; the TTS marker and following text are
        suppressed - they must not appear in the downstream text frames or
        in the TTSTextFrame context pushed at response end.
        """
        requested_ids: list[str] = []

        async def tracking_fetch(recording_id: str):
            # Record every id asked for so the test can verify it afterwards.
            requested_ids.append(recording_id)
            return FAKE_AUDIO

        router = _make_processor(fetch=tracking_fetch)
        outgoing = [
            *_llm_tokens(
                [
                    RECORDING_MARKER,
                    " fetafnqb",
                    " [ Okay, so it's Nancy here. ]",
                    "\n",
                    TTS_MARKER,
                    " And this is the fallback TTS text.",
                ]
            ),
            LLMFullResponseEndFrame(),
        ]

        downstream, _ = await run_test(
            router,
            frames_to_send=outgoing,
            expected_down_frames=None,
        )

        # The recording must actually have been played ...
        assert any(isinstance(frame, TTSAudioRawFrame) for frame in downstream)
        # ... and exactly one fetch, for the right id, must have happened.
        assert requested_ids == ["fetafnqb"]

        # Text after the ▸ marker must be absent from the speakable
        # (non-skip_tts) LLMTextFrames.
        spoken_parts = []
        for frame in downstream:
            if isinstance(frame, LLMTextFrame) and not frame.skip_tts:
                spoken_parts.append(frame.text)
        assert "fallback TTS text" not in "".join(spoken_parts)

        # The single TTSTextFrame must mention the recording id and must not
        # carry the TTS marker or the text that followed it.
        context_frames = [
            frame for frame in downstream if isinstance(frame, TTSTextFrame)
        ]
        assert len(context_frames) == 1
        context_text = context_frames[0].text
        assert "fetafnqb" in context_text
        assert TTS_MARKER not in context_text
        assert "fallback TTS text" not in context_text