mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
chore: update prompt for pre-recorded audio generation
This commit is contained in:
parent
d8ac9a80b2
commit
1f5229e2df
3 changed files with 259 additions and 11 deletions
|
|
@ -67,6 +67,7 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
self._mode: Optional[str] = None # None = detecting, "tts", "recording"
|
||||
self._recording_id_buffer = ""
|
||||
self._recording_playback_started = False
|
||||
self._second_marker_seen = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame dispatch
|
||||
|
|
@ -95,14 +96,28 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# --- Second marker already seen — drop everything ---
|
||||
if self._second_marker_seen:
|
||||
return
|
||||
|
||||
# --- TTS mode established: pass text through normally ---
|
||||
if self._mode == "tts":
|
||||
await self.push_frame(frame, direction)
|
||||
if RECORDING_MARKER in frame.text:
|
||||
before = frame.text[: frame.text.index(RECORDING_MARKER)]
|
||||
if before:
|
||||
await self.push_frame(LLMTextFrame(before), direction)
|
||||
self._second_marker_seen = True
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
# --- Recording mode: accumulate text and start playback ASAP ---
|
||||
if self._mode == "recording":
|
||||
self._recording_id_buffer += frame.text
|
||||
text = frame.text
|
||||
if TTS_MARKER in text:
|
||||
text = text[: text.index(TTS_MARKER)]
|
||||
self._second_marker_seen = True
|
||||
self._recording_id_buffer += text
|
||||
if not self._recording_playback_started:
|
||||
buf = self._recording_id_buffer.lstrip()
|
||||
if " " in buf:
|
||||
|
|
@ -269,3 +284,4 @@ class RecordingRouterProcessor(FrameProcessor):
|
|||
self._mode = None
|
||||
self._recording_id_buffer = ""
|
||||
self._recording_playback_started = False
|
||||
self._second_marker_seen = False
|
||||
|
|
|
|||
|
|
@ -26,24 +26,24 @@ TTS_MARKER = "▸" # Generate dynamic TTS text
|
|||
|
||||
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
|
||||
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
|
||||
Every response you generate MUST begin with a response mode indicator.
|
||||
Every response you generate MUST begin with exactly one response mode indicator.
|
||||
You have two modes for responding:
|
||||
|
||||
1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
|
||||
Format: `▸` followed by a space and your full spoken response.
|
||||
Format: ▸ followed by a space and your full spoken response. Nothing else.
|
||||
Example: ▸ Hello! How can I help you today?
|
||||
|
||||
2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
|
||||
Format: `●` followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
||||
Format: ● followed by a space followed by recording_id followed by provided transcript. Nothing else.
|
||||
Example: ● rec_greeting_01 [ Provided Transcript ]
|
||||
|
||||
RULES:
|
||||
- Your response MUST start with either `▸` or `●` as the very first character.
|
||||
- For `▸` (dynamic speech): Follow with a space and your full response text.
|
||||
- For `●` (pre-recorded audio): Follow with a space and the recording_id and the provided transcript. No other text.
|
||||
- Use `●` when a pre-recorded message matches the situation well.
|
||||
- Use `▸` when you need to generate a dynamic, contextual response.
|
||||
- NEVER mix modes in a single response. Choose one."""
|
||||
- Your response MUST start with either ▸ or ● as the very first character.
|
||||
- For ▸ (dynamic speech): Follow with a space and your response to be generated using the TTS engine. Don't mix with ●.
|
||||
- For ● (pre-recorded audio): Follow with a space and the recording_id of the audio clip with its transcript. Don't mix with ▸.
|
||||
- Use ● when a pre-recorded message matches the situation well.
|
||||
- Use ▸ when you need to generate a dynamic, contextual response.
|
||||
- *NEVER* mix modes in a single response, since we rely on the markers to decide whether to play using TTS or Pre-recorded audio."""
|
||||
|
||||
|
||||
def compose_system_prompt_for_node(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue