mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-04 10:52:17 +02:00
chore: update prompt for pre-recorded audio generation
This commit is contained in:
parent
d8ac9a80b2
commit
1f5229e2df
3 changed files with 259 additions and 11 deletions
|
|
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
|
||||||
self._mode: Optional[str] = None # None = detecting, "tts", "recording"
|
self._mode: Optional[str] = None # None = detecting, "tts", "recording"
|
||||||
self._recording_id_buffer = ""
|
self._recording_id_buffer = ""
|
||||||
self._recording_playback_started = False
|
self._recording_playback_started = False
|
||||||
|
self._second_marker_seen = False
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Frame dispatch
|
# Frame dispatch
|
||||||
|
|
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# --- Second marker already seen — drop everything ---
|
||||||
|
if self._second_marker_seen:
|
||||||
|
return
|
||||||
|
|
||||||
# --- TTS mode established: pass text through normally ---
|
# --- TTS mode established: pass text through normally ---
|
||||||
if self._mode == "tts":
|
if self._mode == "tts":
|
||||||
await self.push_frame(frame, direction)
|
if RECORDING_MARKER in frame.text:
|
||||||
|
before = frame.text[: frame.text.index(RECORDING_MARKER)]
|
||||||
|
if before:
|
||||||
|
await self.push_frame(LLMTextFrame(before), direction)
|
||||||
|
self._second_marker_seen = True
|
||||||
|
else:
|
||||||
|
await self.push_frame(frame, direction)
|
||||||
return
|
return
|
||||||
|
|
||||||
# --- Recording mode: accumulate text and start playback ASAP ---
|
# --- Recording mode: accumulate text and start playback ASAP ---
|
||||||
if self._mode == "recording":
|
if self._mode == "recording":
|
||||||
self._recording_id_buffer += frame.text
|
text = frame.text
|
||||||
|
if TTS_MARKER in text:
|
||||||
|
text = text[: text.index(TTS_MARKER)]
|
||||||
|
self._second_marker_seen = True
|
||||||
|
self._recording_id_buffer += text
|
||||||
if not self._recording_playback_started:
|
if not self._recording_playback_started:
|
||||||
buf = self._recording_id_buffer.lstrip()
|
buf = self._recording_id_buffer.lstrip()
|
||||||
if " " in buf:
|
if " " in buf:
|
||||||
|
|
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
|
||||||
self._mode = None
|
self._mode = None
|
||||||
self._recording_id_buffer = ""
|
self._recording_id_buffer = ""
|
||||||
self._recording_playback_started = False
|
self._recording_playback_started = False
|
||||||
|
self._second_marker_seen = False
|
||||||
|
|
|
||||||
|
|
@ -26,24 +26,24 @@ TTS_MARKER = "▸" # Generate dynamic TTS text
|
||||||
|
|
||||||
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
|
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
|
||||||
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
|
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
|
||||||
Every response you generate MUST begin with a response mode indicator.
|
Every response you generate MUST begin with exactly one response mode indicator.
|
||||||
You have two modes for responding:
|
You have two modes for responding:
|
||||||
|
|
||||||
1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
|
1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
|
||||||
Format: `▸` followed by a space and your full spoken response.
|
Format: ▸ followed by a space and your full spoken response. Nothing else.
|
||||||
Example: ▸ Hello! How can I help you today?
|
Example: ▸ Hello! How can I help you today?
|
||||||
|
|
||||||
2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
|
2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
|
||||||
Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
||||||
Example: ● rec_greeting_01 [ Provided Transcript ]
|
Example: ● rec_greeting_01 [ Provided Transcript ]
|
||||||
|
|
||||||
RULES:
|
RULES:
|
||||||
- Your response MUST start with either `▸` or `●` as the very first character.
|
- Your response MUST start with either ▸ or ● as the very first character.
|
||||||
- For `▸` (dynamic speech): Follow with a space and your full response text.
|
- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Do not mix with ●.
|
||||||
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
|
- For ● (pre-recorded audio): Follow with a space, the recording_id of the audio clip, and its transcript. Do not mix with ▸.
|
||||||
- Use `●` when a pre-recorded message matches the situation well.
|
- Use ● when a pre-recorded message matches the situation well.
|
||||||
- Use `▸` when you need to generate a dynamic, contextual response.
|
- Use ▸ when you need to generate a dynamic, contextual response.
|
||||||
- NEVER mix modes in a single response. Choose one."""
|
- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
|
||||||
|
|
||||||
|
|
||||||
def compose_system_prompt_for_node(
|
def compose_system_prompt_for_node(
|
||||||
|
|
|
||||||
232
api/tests/test_recording_router_processor.py
Normal file
232
api/tests/test_recording_router_processor.py
Normal file
|
|
@ -0,0 +1,232 @@
|
||||||
|
"""Tests for RecordingRouterProcessor mixed-marker handling.
|
||||||
|
|
||||||
|
When the LLM generates a response containing both a TTS marker (▸) and a
|
||||||
|
recording marker (●), only the *first* marker should be honoured. Everything
|
||||||
|
from the second marker onward must be silently dropped so it never reaches
|
||||||
|
downstream TTS or triggers a second recording playback.
|
||||||
|
|
||||||
|
Uses pipecat's ``run_test`` helper to send frames through a real pipeline
|
||||||
|
and inspect what arrives downstream.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from api.services.pipecat.recording_router_processor import (
|
||||||
|
RecordingRouterProcessor,
|
||||||
|
)
|
||||||
|
from api.services.workflow.pipecat_engine_context_composer import (
|
||||||
|
RECORDING_MARKER,
|
||||||
|
TTS_MARKER,
|
||||||
|
)
|
||||||
|
from pipecat.frames.frames import (
|
||||||
|
LLMFullResponseEndFrame,
|
||||||
|
LLMTextFrame,
|
||||||
|
TTSAudioRawFrame,
|
||||||
|
TTSStartedFrame,
|
||||||
|
TTSStoppedFrame,
|
||||||
|
TTSTextFrame,
|
||||||
|
)
|
||||||
|
from pipecat.tests import run_test
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
FAKE_AUDIO = b"\x00\x01" * 8000 # 8,000 samples = 0.5 second of 16-bit mono @ 16 kHz
|
||||||
|
|
||||||
|
|
||||||
|
async def _fake_fetch(recording_id: str) -> Optional[bytes]:
|
||||||
|
"""Stub that returns fake PCM audio for any recording_id."""
|
||||||
|
return FAKE_AUDIO
|
||||||
|
|
||||||
|
|
||||||
|
def _make_processor(**kwargs) -> RecordingRouterProcessor:
|
||||||
|
return RecordingRouterProcessor(
|
||||||
|
audio_sample_rate=16_000,
|
||||||
|
fetch_recording_audio=kwargs.pop("fetch", _fake_fetch),
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
|
||||||
|
"""Build a list of LLMTextFrame from raw strings."""
|
||||||
|
return [LLMTextFrame(text=t) for t in tokens]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tests — single marker (baseline sanity)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestSingleMarker:
|
||||||
|
"""Verify basic TTS-only and recording-only paths still work."""
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_tts_only(self):
|
||||||
|
"""▸ Hello — text should flow downstream as LLMTextFrames."""
|
||||||
|
processor = _make_processor()
|
||||||
|
|
||||||
|
frames_to_send = _llm_tokens(
|
||||||
|
[
|
||||||
|
TTS_MARKER,
|
||||||
|
" Hello, how are you today?",
|
||||||
|
]
|
||||||
|
) + [LLMFullResponseEndFrame()]
|
||||||
|
|
||||||
|
down, _ = await run_test(
|
||||||
|
processor,
|
||||||
|
frames_to_send=frames_to_send,
|
||||||
|
expected_down_frames=None, # don't assert types, inspect manually
|
||||||
|
)
|
||||||
|
|
||||||
|
tts_text = "".join(
|
||||||
|
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||||
|
)
|
||||||
|
assert "Hello, how are you today?" in tts_text
|
||||||
|
|
||||||
|
# No audio playback
|
||||||
|
assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_recording_only(self):
|
||||||
|
"""● rec_id [transcript] — should play audio and push TTSTextFrame
|
||||||
|
context."""
|
||||||
|
processor = _make_processor()
|
||||||
|
|
||||||
|
frames_to_send = _llm_tokens(
|
||||||
|
[
|
||||||
|
RECORDING_MARKER,
|
||||||
|
" abc123",
|
||||||
|
" [ This is the transcript. ]",
|
||||||
|
]
|
||||||
|
) + [LLMFullResponseEndFrame()]
|
||||||
|
|
||||||
|
down, _ = await run_test(
|
||||||
|
processor,
|
||||||
|
frames_to_send=frames_to_send,
|
||||||
|
expected_down_frames=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Audio playback frames should be present
|
||||||
|
assert any(isinstance(f, TTSStartedFrame) for f in down)
|
||||||
|
assert any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||||
|
assert any(isinstance(f, TTSStoppedFrame) for f in down)
|
||||||
|
|
||||||
|
# Context TTSTextFrame with transcript
|
||||||
|
ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
|
||||||
|
assert len(ctx_frames) == 1
|
||||||
|
assert "abc123" in ctx_frames[0].text
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tests — mixed markers (the bug)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestMixedMarkerSuppression:
|
||||||
|
"""The LLM sometimes generates both markers in one response.
|
||||||
|
|
||||||
|
Only the first marker should be honoured; the second marker and
|
||||||
|
everything after it must be dropped.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_tts_then_recording_marker_ignores_recording(self):
|
||||||
|
"""▸ text... ● rec_id [transcript]
|
||||||
|
|
||||||
|
Expected: only the TTS text reaches downstream; the recording
|
||||||
|
marker, recording_id, and bracketed transcript are all suppressed.
|
||||||
|
No audio playback frames should appear.
|
||||||
|
"""
|
||||||
|
processor = _make_processor()
|
||||||
|
|
||||||
|
frames_to_send = _llm_tokens(
|
||||||
|
[
|
||||||
|
TTS_MARKER,
|
||||||
|
" Okay, so this is regarding government changes.",
|
||||||
|
"\n",
|
||||||
|
RECORDING_MARKER,
|
||||||
|
" fetafnqb",
|
||||||
|
" [ Okay, so it's Nancy here. ]",
|
||||||
|
]
|
||||||
|
) + [LLMFullResponseEndFrame()]
|
||||||
|
|
||||||
|
down, _ = await run_test(
|
||||||
|
processor,
|
||||||
|
frames_to_send=frames_to_send,
|
||||||
|
expected_down_frames=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Collect all LLMTextFrame text that was NOT marked skip_tts
|
||||||
|
tts_text = "".join(
|
||||||
|
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||||
|
)
|
||||||
|
|
||||||
|
# The TTS text should contain the first sentence
|
||||||
|
assert "government changes" in tts_text
|
||||||
|
|
||||||
|
# Nothing from the recording section should leak into TTS
|
||||||
|
assert RECORDING_MARKER not in tts_text
|
||||||
|
assert "fetafnqb" not in tts_text
|
||||||
|
assert "Nancy" not in tts_text
|
||||||
|
|
||||||
|
# No audio playback frames
|
||||||
|
assert not any(isinstance(f, TTSStartedFrame) for f in down)
|
||||||
|
assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||||
|
assert not any(isinstance(f, TTSStoppedFrame) for f in down)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_recording_then_tts_marker_ignores_tts(self):
|
||||||
|
"""● rec_id [transcript] ▸ text...
|
||||||
|
|
||||||
|
Expected: recording plays; the TTS marker and following text are
|
||||||
|
suppressed — they must not appear in any downstream frame, including
|
||||||
|
the TTSTextFrame context pushed at response end.
|
||||||
|
"""
|
||||||
|
fetched_ids: list[str] = []
|
||||||
|
|
||||||
|
async def tracking_fetch(recording_id: str):
|
||||||
|
fetched_ids.append(recording_id)
|
||||||
|
return FAKE_AUDIO
|
||||||
|
|
||||||
|
processor = _make_processor(fetch=tracking_fetch)
|
||||||
|
|
||||||
|
frames_to_send = _llm_tokens(
|
||||||
|
[
|
||||||
|
RECORDING_MARKER,
|
||||||
|
" fetafnqb",
|
||||||
|
" [ Okay, so it's Nancy here. ]",
|
||||||
|
"\n",
|
||||||
|
TTS_MARKER,
|
||||||
|
" And this is the fallback TTS text.",
|
||||||
|
]
|
||||||
|
) + [LLMFullResponseEndFrame()]
|
||||||
|
|
||||||
|
down, _ = await run_test(
|
||||||
|
processor,
|
||||||
|
frames_to_send=frames_to_send,
|
||||||
|
expected_down_frames=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Recording playback should have occurred
|
||||||
|
assert any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||||
|
|
||||||
|
# Only the correct recording_id should have been fetched
|
||||||
|
assert fetched_ids == ["fetafnqb"]
|
||||||
|
|
||||||
|
# The TTS text after the ▸ marker must NOT reach TTS (no speakable, non-skip_tts LLMTextFrame may carry it)
|
||||||
|
all_text = "".join(
|
||||||
|
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||||
|
)
|
||||||
|
assert "fallback TTS text" not in all_text
|
||||||
|
|
||||||
|
# The TTSTextFrame context pushed at response end should only contain
|
||||||
|
# the recording marker + recording_id + transcript, not the TTS part
|
||||||
|
ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
|
||||||
|
assert len(ctx_frames) == 1
|
||||||
|
ctx_text = ctx_frames[0].text
|
||||||
|
assert "fetafnqb" in ctx_text
|
||||||
|
assert TTS_MARKER not in ctx_text
|
||||||
|
assert "fallback TTS text" not in ctx_text
|
||||||
Loading…
Add table
Add a link
Reference in a new issue