mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: allow recording audio in workflow builder
This commit is contained in:
parent
ac0731a374
commit
2fa4191d9b
22 changed files with 700 additions and 246 deletions
|
|
@ -351,6 +351,71 @@ class MPSServiceKeyClient:
|
|||
response=response,
|
||||
)
|
||||
|
||||
async def transcribe_audio(
    self,
    audio_data: bytes,
    filename: str = "audio.wav",
    content_type: str = "audio/wav",
    language: str = "en",
    model: str = "default",
    correlation_id: Optional[str] = None,
    organization_id: Optional[int] = None,
    created_by: Optional[str] = None,
) -> dict:
    """
    Transcribe an audio file via MPS STT API.

    The audio is uploaded as a multipart form to
    ``{base_url}/api/v1/stt/transcribe`` with a 60 second timeout.

    Args:
        audio_data: Raw audio bytes
        filename: Name of the audio file sent in the multipart part
        content_type: MIME type of the audio (e.g., audio/wav, audio/mp3)
        language: Language code for transcription (default: "en")
        model: Model tier name (default: "default")
        correlation_id: Optional correlation ID for tracking; only sent
            when truthy
        organization_id: Organization ID (for authenticated mode)
        created_by: User provider ID (for OSS mode)

    Returns:
        The decoded JSON body of the response: a dictionary containing
        the transcription result with keys like 'transcript',
        'duration_seconds', etc.

    Raises:
        httpx.HTTPStatusError: If the API responds with any status other
            than 200
    """
    files = {"file": (filename, audio_data, content_type)}
    data = {"language": language, "model": model}
    if correlation_id:
        data["correlation_id"] = correlation_id

    # Drop any Content-Type header so httpx can set the multipart boundary
    # itself. Header names are case-insensitive, so match on the lowered
    # name, and build a new dict instead of popping in place so the mapping
    # returned by _get_headers is never mutated.
    headers = {
        name: value
        for name, value in self._get_headers(organization_id, created_by).items()
        if name.lower() != "content-type"
    }

    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
        response = await client.post(
            f"{self.base_url}/api/v1/stt/transcribe",
            files=files,
            data=data,
            headers=headers,
        )

        if response.status_code == 200:
            return response.json()

        logger.error(
            f"Failed to transcribe audio: {response.status_code} - {response.text}"
        )
        raise httpx.HTTPStatusError(
            f"Failed to transcribe audio: {response.text}",
            request=response.request,
            response=response,
        )
|
||||
|
||||
def validate_service_key(self, service_key: str) -> bool:
|
||||
"""
|
||||
Synchronously validate a Dograh service key by checking usage via MPS.
|
||||
|
|
|
|||
|
|
@ -165,49 +165,39 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
frame = data.frame
|
||||
frame_direction = data.direction
|
||||
|
||||
logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
|
||||
|
||||
# Handle pipeline termination - stop clock task
|
||||
if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
|
||||
await self._cancel_clock_task()
|
||||
return
|
||||
|
||||
# Handle interruptions - clear any queued bot text
|
||||
if isinstance(frame, InterruptionFrame):
|
||||
await self._handle_interruption()
|
||||
return
|
||||
|
||||
# Bot speaking state - WS only (ephemeral state signals, not persisted)
|
||||
if isinstance(frame, BotStartedSpeakingFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.BOT_STARTED_SPEAKING.value, "payload": {}}
|
||||
)
|
||||
return
|
||||
if isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.BOT_STOPPED_SPEAKING.value, "payload": {}}
|
||||
)
|
||||
return
|
||||
|
||||
# User mute state - WS only (ephemeral state signals, not persisted)
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.USER_MUTE_STARTED.value, "payload": {}}
|
||||
)
|
||||
return
|
||||
if isinstance(frame, UserMuteStoppedFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.USER_MUTE_STOPPED.value, "payload": {}}
|
||||
)
|
||||
return
|
||||
|
||||
# Skip already processed frames (frames can be observed multiple times)
|
||||
if frame.id in self._frames_seen:
|
||||
return
|
||||
self._frames_seen.add(frame.id)
|
||||
|
||||
logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
|
||||
|
||||
# Handle pipeline termination - stop clock task
|
||||
if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
|
||||
await self._cancel_clock_task()
|
||||
# Handle interruptions - clear any queued bot text
|
||||
elif isinstance(frame, InterruptionFrame):
|
||||
await self._handle_interruption()
|
||||
# Bot speaking state - WS only (ephemeral state signals, not persisted)
|
||||
elif isinstance(frame, BotStartedSpeakingFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.BOT_STARTED_SPEAKING.value, "payload": {}}
|
||||
)
|
||||
elif isinstance(frame, BotStoppedSpeakingFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.BOT_STOPPED_SPEAKING.value, "payload": {}}
|
||||
)
|
||||
# User mute state - WS only (ephemeral state signals, not persisted)
|
||||
elif isinstance(frame, UserMuteStartedFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.USER_MUTE_STARTED.value, "payload": {}}
|
||||
)
|
||||
elif isinstance(frame, UserMuteStoppedFrame):
|
||||
await self._send_ws(
|
||||
{"type": RealtimeFeedbackType.USER_MUTE_STOPPED.value, "payload": {}}
|
||||
)
|
||||
# Handle user transcriptions (interim) - WebSocket only
|
||||
if isinstance(frame, InterimTranscriptionFrame):
|
||||
elif isinstance(frame, InterimTranscriptionFrame):
|
||||
await self._send_ws(
|
||||
{
|
||||
"type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
|
||||
|
|
|
|||
|
|
@ -77,11 +77,8 @@ def compose_system_prompt_for_node(
|
|||
|
||||
parts = [p for p in (global_prompt, formatted_node_prompt) if p]
|
||||
|
||||
if has_recordings:
|
||||
if has_recordings and "RECORDING_ID:" in formatted_node_prompt:
|
||||
parts.append(RECORDING_RESPONSE_MODE_INSTRUCTIONS)
|
||||
# TODO: Append per-node available recordings list here once
|
||||
# Node.recording_ids is populated. The list should include
|
||||
# recording_id and a short description so the LLM can choose.
|
||||
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue