mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
chore: update prompt for pre-recorded audio generation
This commit is contained in:
parent
d8ac9a80b2
commit
1f5229e2df
3 changed files with 259 additions and 11 deletions
|
|
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
self._mode: Optional[str] = None # None = detecting, "tts", "recording"
|
||||
self._recording_id_buffer = ""
|
||||
self._recording_playback_started = False
|
||||
self._second_marker_seen = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame dispatch
|
||||
|
|
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# --- Second marker already seen — drop everything ---
|
||||
if self._second_marker_seen:
|
||||
return
|
||||
|
||||
# --- TTS mode established: pass text through normally ---
|
||||
if self._mode == "tts":
|
||||
await self.push_frame(frame, direction)
|
||||
if RECORDING_MARKER in frame.text:
|
||||
before = frame.text[: frame.text.index(RECORDING_MARKER)]
|
||||
if before:
|
||||
await self.push_frame(LLMTextFrame(before), direction)
|
||||
self._second_marker_seen = True
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# --- Recording mode: accumulate text and start playback ASAP ---
|
||||
if self._mode == "recording":
|
||||
self._recording_id_buffer += frame.text
|
||||
text = frame.text
|
||||
if TTS_MARKER in text:
|
||||
text = text[: text.index(TTS_MARKER)]
|
||||
self._second_marker_seen = True
|
||||
self._recording_id_buffer += text
|
||||
if not self._recording_playback_started:
|
||||
buf = self._recording_id_buffer.lstrip()
|
||||
if " " in buf:
|
||||
|
|
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
self._mode = None
|
||||
self._recording_id_buffer = ""
|
||||
self._recording_playback_started = False
|
||||
self._second_marker_seen = False
|
||||
|
|
|
|||
|
|
@ -26,24 +26,24 @@ TTS_MARKER = "▸" # Generate dynamic TTS text
|
|||
|
||||
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
|
||||
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
|
||||
Every response you generate MUST begin with a response mode indicator.
|
||||
Every response you generate MUST begin with exactly one response mode indicator.
|
||||
You have two modes for responding:
|
||||
|
||||
1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
|
||||
Format: `▸` followed by a space and your full spoken response.
|
||||
Format: ▸ followed by a space and your full spoken response. Nothing else.
|
||||
Example: ▸ Hello! How can I help you today?
|
||||
|
||||
2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
|
||||
Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
||||
Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
||||
Example: ● rec_greeting_01 [ Provided Transcript ]
|
||||
|
||||
RULES:
|
||||
- Your response MUST start with either `▸` or `●` as the very first character.
|
||||
- For `▸` (dynamic speech): Follow with a space and your full response text.
|
||||
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
|
||||
- Use `●` when a pre-recorded message matches the situation well.
|
||||
- Use `▸` when you need to generate a dynamic, contextual response.
|
||||
- NEVER mix modes in a single response. Choose one."""
|
||||
- Your response MUST start with either ▸ or ● as the very first character.
|
||||
- For ▸ (dynamic speech): Follow with a space and your response to be generated using TTS engine. Don't mix with ●
|
||||
- For ● (pre-recorded audio): Follow with a space and recording_id of the audio clip with its transcript. Don't mix with ▸
|
||||
- Use ● when a pre-recorded message matches the situation well.
|
||||
- Use ▸ when you need to generate a dynamic, contextual response.
|
||||
- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
|
||||
|
||||
|
||||
def compose_system_prompt_for_node(
|
||||
|
|
|
|||
232
api/tests/test_recording_router_processor.py
Normal file
232
api/tests/test_recording_router_processor.py
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
"""Tests for RecordingRouterProcessor mixed-marker handling.
|
||||
|
||||
When the LLM generates a response containing both a TTS marker (▸) and a
|
||||
recording marker (●), only the *first* marker should be honoured. Everything
|
||||
from the second marker onward must be silently dropped so it never reaches
|
||||
downstream TTS or triggers a second recording playback.
|
||||
|
||||
Uses pipecat's ``run_test`` helper to send frames through a real pipeline
|
||||
and inspect what arrives downstream.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from api.services.pipecat.recording_router_processor import (
|
||||
RecordingRouterProcessor,
|
||||
)
|
||||
from api.services.workflow.pipecat_engine_context_composer import (
|
||||
RECORDING_MARKER,
|
||||
TTS_MARKER,
|
||||
)
|
||||
from pipecat.frames.frames import (
|
||||
LLMFullResponseEndFrame,
|
||||
LLMTextFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
TTSTextFrame,
|
||||
)
|
||||
from pipecat.tests import run_test
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
FAKE_AUDIO = b"\x00\x01" * 8000 # 0.5 seconds of 16-bit mono @ 16 kHz (8000 samples, 16000 bytes)
|
||||
|
||||
|
||||
async def _fake_fetch(recording_id: str) -> Optional[bytes]:
|
||||
"""Stub that returns fake PCM audio for any recording_id."""
|
||||
return FAKE_AUDIO
|
||||
|
||||
|
||||
def _make_processor(**kwargs) -> RecordingRouterProcessor:
|
||||
return RecordingRouterProcessor(
|
||||
audio_sample_rate=16_000,
|
||||
fetch_recording_audio=kwargs.pop("fetch", _fake_fetch),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
def _llm_tokens(tokens: list[str]) -> list[LLMTextFrame]:
|
||||
"""Build a list of LLMTextFrame from raw strings."""
|
||||
return [LLMTextFrame(text=t) for t in tokens]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests — single marker (baseline sanity)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSingleMarker:
|
||||
"""Verify basic TTS-only and recording-only paths still work."""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tts_only(self):
|
||||
"""▸ Hello — text should flow downstream as LLMTextFrames."""
|
||||
processor = _make_processor()
|
||||
|
||||
frames_to_send = _llm_tokens(
|
||||
[
|
||||
TTS_MARKER,
|
||||
" Hello, how are you today?",
|
||||
]
|
||||
) + [LLMFullResponseEndFrame()]
|
||||
|
||||
down, _ = await run_test(
|
||||
processor,
|
||||
frames_to_send=frames_to_send,
|
||||
expected_down_frames=None, # don't assert types, inspect manually
|
||||
)
|
||||
|
||||
tts_text = "".join(
|
||||
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||
)
|
||||
assert "Hello, how are you today?" in tts_text
|
||||
|
||||
# No audio playback
|
||||
assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_recording_only(self):
|
||||
"""● rec_id [transcript] — should play audio and push TTSTextFrame
|
||||
context."""
|
||||
processor = _make_processor()
|
||||
|
||||
frames_to_send = _llm_tokens(
|
||||
[
|
||||
RECORDING_MARKER,
|
||||
" abc123",
|
||||
" [ This is the transcript. ]",
|
||||
]
|
||||
) + [LLMFullResponseEndFrame()]
|
||||
|
||||
down, _ = await run_test(
|
||||
processor,
|
||||
frames_to_send=frames_to_send,
|
||||
expected_down_frames=None,
|
||||
)
|
||||
|
||||
# Audio playback frames should be present
|
||||
assert any(isinstance(f, TTSStartedFrame) for f in down)
|
||||
assert any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||
assert any(isinstance(f, TTSStoppedFrame) for f in down)
|
||||
|
||||
# Context TTSTextFrame with transcript
|
||||
ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
|
||||
assert len(ctx_frames) == 1
|
||||
assert "abc123" in ctx_frames[0].text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests — mixed markers (the bug)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMixedMarkerSuppression:
|
||||
"""The LLM sometimes generates both markers in one response.
|
||||
|
||||
Only the first marker should be honoured; the second marker and
|
||||
everything after it must be dropped.
|
||||
"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tts_then_recording_marker_ignores_recording(self):
|
||||
"""▸ text... ● rec_id [transcript]
|
||||
|
||||
Expected: only the TTS text reaches downstream; the recording
|
||||
marker, recording_id, and bracketed transcript are all suppressed.
|
||||
No audio playback frames should appear.
|
||||
"""
|
||||
processor = _make_processor()
|
||||
|
||||
frames_to_send = _llm_tokens(
|
||||
[
|
||||
TTS_MARKER,
|
||||
" Okay, so this is regarding government changes.",
|
||||
"\n",
|
||||
RECORDING_MARKER,
|
||||
" fetafnqb",
|
||||
" [ Okay, so it's Nancy here. ]",
|
||||
]
|
||||
) + [LLMFullResponseEndFrame()]
|
||||
|
||||
down, _ = await run_test(
|
||||
processor,
|
||||
frames_to_send=frames_to_send,
|
||||
expected_down_frames=None,
|
||||
)
|
||||
|
||||
# Collect all LLMTextFrame text that was NOT marked skip_tts
|
||||
tts_text = "".join(
|
||||
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||
)
|
||||
|
||||
# The TTS text should contain the first sentence
|
||||
assert "government changes" in tts_text
|
||||
|
||||
# Nothing from the recording section should leak into TTS
|
||||
assert RECORDING_MARKER not in tts_text
|
||||
assert "fetafnqb" not in tts_text
|
||||
assert "Nancy" not in tts_text
|
||||
|
||||
# No audio playback frames
|
||||
assert not any(isinstance(f, TTSStartedFrame) for f in down)
|
||||
assert not any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||
assert not any(isinstance(f, TTSStoppedFrame) for f in down)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_recording_then_tts_marker_ignores_tts(self):
|
||||
"""● rec_id [transcript] ▸ text...
|
||||
|
||||
Expected: recording plays; the TTS marker and following text are
|
||||
suppressed — they must not appear in any downstream frame, including
|
||||
the TTSTextFrame context pushed at response end.
|
||||
"""
|
||||
fetched_ids: list[str] = []
|
||||
|
||||
async def tracking_fetch(recording_id: str):
|
||||
fetched_ids.append(recording_id)
|
||||
return FAKE_AUDIO
|
||||
|
||||
processor = _make_processor(fetch=tracking_fetch)
|
||||
|
||||
frames_to_send = _llm_tokens(
|
||||
[
|
||||
RECORDING_MARKER,
|
||||
" fetafnqb",
|
||||
" [ Okay, so it's Nancy here. ]",
|
||||
"\n",
|
||||
TTS_MARKER,
|
||||
" And this is the fallback TTS text.",
|
||||
]
|
||||
) + [LLMFullResponseEndFrame()]
|
||||
|
||||
down, _ = await run_test(
|
||||
processor,
|
||||
frames_to_send=frames_to_send,
|
||||
expected_down_frames=None,
|
||||
)
|
||||
|
||||
# Recording playback should have occurred
|
||||
assert any(isinstance(f, TTSAudioRawFrame) for f in down)
|
||||
|
||||
# Only the correct recording_id should have been fetched
|
||||
assert fetched_ids == ["fetafnqb"]
|
||||
|
||||
# The TTS text after the ▸ marker must NOT appear in any downstream frame
|
||||
all_text = "".join(
|
||||
f.text for f in down if isinstance(f, LLMTextFrame) and not f.skip_tts
|
||||
)
|
||||
assert "fallback TTS text" not in all_text
|
||||
|
||||
# The TTSTextFrame context pushed at response end should only contain
|
||||
# the recording marker + recording_id + transcript, not the TTS part
|
||||
ctx_frames = [f for f in down if isinstance(f, TTSTextFrame)]
|
||||
assert len(ctx_frames) == 1
|
||||
ctx_text = ctx_frames[0].text
|
||||
assert "fetafnqb" in ctx_text
|
||||
assert TTS_MARKER not in ctx_text
|
||||
assert "fallback TTS text" not in ctx_text
|
||||
Loading…
Add table
Add a link
Reference in a new issue