mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-16 08:25:18 +02:00
fix: migrate from custom audio recorder to native AudioBuffer (#115)
* fix: update to pipecat VM Detector * fix: refactor to remove audio synchronizer * feat: add speechmatics as STT
This commit is contained in:
parent
31521008cf
commit
edf0fa4fbc
12 changed files with 193 additions and 591 deletions
|
|
@ -1,19 +1,14 @@
|
|||
from typing import TYPE_CHECKING, Any, Awaitable, Callable, Optional, Union
|
||||
|
||||
from api.constants import DEPLOYMENT_MODE, ENABLE_TRACING, VOICEMAIL_RECORDING_DURATION
|
||||
from api.services.workflow.disposition_mapper import (
|
||||
apply_disposition_mapping,
|
||||
get_organization_id_from_workflow_run,
|
||||
)
|
||||
from api.services.workflow.pipecat_engine_voicemail_detector import (
|
||||
VoicemailDetector,
|
||||
)
|
||||
from api.services.workflow.workflow import Node, WorkflowGraph
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
FunctionCallResultProperties,
|
||||
LLMContextFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
|
|
@ -93,11 +88,6 @@ class PipecatEngine:
|
|||
# access to _context
|
||||
self._variable_extraction_manager = None
|
||||
|
||||
# Voicemail detection state
|
||||
self._detect_voicemail = False
|
||||
self._voicemail_detector = None
|
||||
self._voicemail_detection_task: Optional[asyncio.Task] = None
|
||||
|
||||
# Lazy loaded built-in function schemas
|
||||
self._builtin_function_schemas: Optional[list[dict]] = None
|
||||
|
||||
|
|
@ -172,8 +162,6 @@ class PipecatEngine:
|
|||
|
||||
await self.set_node(self.workflow.start_node_id)
|
||||
|
||||
# Trigger initial LLM generation
|
||||
await self.task.queue_frame(LLMContextFrame(self.context))
|
||||
logger.debug(f"{self.__class__.__name__} initialized")
|
||||
except Exception as e:
|
||||
logger.error(f"Error initializing {self.__class__.__name__}: {e}")
|
||||
|
|
@ -388,43 +376,6 @@ class PipecatEngine:
|
|||
|
||||
async def _handle_start_node(self, node: Node) -> None:
|
||||
"""Handle start node execution."""
|
||||
# Handle voicemail detection setup (before any returns)
|
||||
# Lets check ENABLE_TRACING to make sure we have prompt access from
|
||||
# langfuse
|
||||
if node.detect_voicemail and DEPLOYMENT_MODE == "saas" and ENABLE_TRACING:
|
||||
if not self._audio_buffer:
|
||||
logger.warning(
|
||||
"Voicemail detection enabled but no audio buffer available - skipping detection"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
"Start node has detect_voicemail enabled - setting up audio-based detector"
|
||||
)
|
||||
self._detect_voicemail = True
|
||||
|
||||
self._voicemail_detector = VoicemailDetector(
|
||||
detection_duration=VOICEMAIL_RECORDING_DURATION,
|
||||
workflow_run_id=self._workflow_run_id,
|
||||
)
|
||||
|
||||
# Register audio handler on the audio buffer input processor
|
||||
audio_input = self._audio_buffer.input()
|
||||
|
||||
@audio_input.event_handler("on_input_audio_data")
|
||||
async def handle_voicemail_audio(
|
||||
processor, pcm, sample_rate, num_channels
|
||||
):
|
||||
if (
|
||||
self._voicemail_detector
|
||||
and self._voicemail_detector.is_detecting
|
||||
):
|
||||
await self._voicemail_detector.handle_audio_data(
|
||||
processor, pcm, sample_rate, num_channels
|
||||
)
|
||||
|
||||
# Start detection
|
||||
await self._voicemail_detector.start_detection(self)
|
||||
|
||||
# Check if delayed start is enabled
|
||||
if node.delayed_start:
|
||||
# Use configured duration or default to 3 seconds
|
||||
|
|
@ -745,8 +696,4 @@ class PipecatEngine:
|
|||
):
|
||||
self._user_response_timeout_task.cancel()
|
||||
|
||||
# Stop voicemail detection if active
|
||||
if self._voicemail_detector and hasattr(
|
||||
self._voicemail_detector, "stop_detection"
|
||||
):
|
||||
await self._voicemail_detector.stop_detection()
|
||||
# Note: Native VoicemailDetector cleanup is handled by the pipeline
|
||||
|
|
|
|||
|
|
@ -1,441 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import wave
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from langfuse import get_client
|
||||
from loguru import logger
|
||||
from openai import AsyncOpenAI
|
||||
from opentelemetry import context as otel_context
|
||||
|
||||
from api.db import db_client
|
||||
from api.services.pipecat.tracing_config import is_tracing_enabled
|
||||
from api.tasks.arq import enqueue_job
|
||||
from api.tasks.function_names import FunctionNames
|
||||
from pipecat.utils.enums import EndTaskReason
|
||||
from pipecat.utils.tracing.context_registry import get_current_turn_context
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from api.services.workflow.pipecat_engine import PipecatEngine
|
||||
|
||||
|
||||
# Fallback prompt used when the Langfuse-managed prompt cannot be fetched or
# compiled. ``{transcript}`` is substituted with ``str.replace`` (not
# ``str.format``) because the prompt also contains literal JSON braces.
DEFAULT_VOICEMAIL_PROMPT = """
You are analyzing the beginning of a phone call to determine if it's a voicemail greeting.

Common voicemail indicators:
- "You've reached the voicemail of..."
- "Please leave a message after the beep"
- "I'm not available right now"
- "Press 1 to leave a message"
- Robotic or pre-recorded voice quality mentioned
- Background music or hold music references

Transcript: {transcript}

Respond with a JSON object:
{
    "is_voicemail": true/false,
    "confidence": 0.0-1.0,
    "reasoning": "Brief explanation"
}
"""


class VoicemailDetector:
    """Detect voicemail greetings from the first seconds of call audio.

    The detector buffers raw PCM handed to :meth:`handle_audio_data`, and a
    background task started by :meth:`start_detection` waits until
    ``detection_duration`` seconds are buffered (or a timeout elapses),
    transcribes the audio, asks an LLM to classify the transcript and, when a
    voicemail is detected, tags the workflow run and asks the engine to end
    the call.
    """

    # The buffered PCM is treated as 16-bit samples throughout this class.
    _SAMPLE_WIDTH_BYTES = 2

    def __init__(
        self, detection_duration: float = 15.0, workflow_run_id: Optional[int] = None
    ):
        """Create an idle detector.

        Args:
            detection_duration: Seconds of audio to collect before classifying.
            workflow_run_id: Workflow run to tag / name uploads after; when
                falsy, nothing is persisted.
        """
        self.detection_duration = detection_duration
        self.audio_buffer = bytearray()
        self.is_detecting = False
        self.workflow_run_id = workflow_run_id
        self._langfuse_client = get_client()

        # Audio format is learned from the first packet received.
        self._sample_rate: Optional[int] = None
        self._num_channels: int = 1

        # Task management
        self._detection_task: Optional[asyncio.Task] = None
        self._is_cancelled = False
        self._engine: Optional[PipecatEngine] = None

        # Set when collection is finished (enough audio, timeout, or stop);
        # once set, no further audio is appended to the buffer.
        self._audio_collected_event = asyncio.Event()

    # ------------------------------------------------------------------
    # Utility helpers
    # ------------------------------------------------------------------

    def _current_duration_seconds(self) -> float:
        """Return the duration (in seconds) of the audio currently buffered."""
        if not self._sample_rate:
            return 0.0
        bytes_per_second = (
            self._sample_rate * self._num_channels * self._SAMPLE_WIDTH_BYTES
        )
        return len(self.audio_buffer) / bytes_per_second

    @staticmethod
    def _parse_detection_response(raw: Optional[str]) -> Optional[dict]:
        """Parse the LLM reply into a dict, or return None if it is unusable.

        A missing/empty reply, malformed JSON and valid JSON that is not an
        object are all treated as unusable.
        """
        if not raw:
            return None
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return None
        return parsed if isinstance(parsed, dict) else None

    async def handle_audio_data(
        self, processor, pcm: bytes, sample_rate: int, num_channels: int
    ):
        """Buffer one packet of incoming audio while collection is active.

        Args:
            processor: The emitting processor (unused).
            pcm: Raw PCM bytes to append.
            sample_rate: Sample rate of ``pcm``; recorded from the first packet.
            num_channels: Channel count of ``pcm``; recorded from the first packet.
        """
        if not self.is_detecting or self._is_cancelled:
            return

        # Collection already finished: keep the buffer frozen so the analysed
        # audio, the reported duration and memory use stay consistent.
        if self._audio_collected_event.is_set():
            return

        if self._sample_rate is None:
            self._sample_rate = sample_rate
            self._num_channels = num_channels or 1
            logger.debug(f"Voicemail detector using sample rate: {sample_rate}")

        # Add to buffer without resampling
        self.audio_buffer.extend(pcm)

        if self._current_duration_seconds() >= self.detection_duration:
            self._audio_collected_event.set()

    async def start_detection(self, engine: PipecatEngine):
        """Start voicemail detection in a background task.

        Args:
            engine: Engine used to end the call when a voicemail is detected.
        """
        logger.info("Starting voicemail detection")
        self.is_detecting = True
        self._is_cancelled = False
        self._engine = engine
        self._audio_collected_event.clear()

        self._detection_task = asyncio.create_task(self._run_detection_with_timeout())

    async def stop_detection(self):
        """Stop detection immediately (called on disconnect)."""
        logger.info("Stopping voicemail detection due to disconnect")
        self._is_cancelled = True
        self.is_detecting = False

        # Unblock the background task if it is still waiting for audio.
        self._audio_collected_event.set()

        task = self._detection_task
        # A task can neither usefully cancel nor await itself; skip both when
        # this is reached from inside the detection task.
        called_from_detection_task = task is asyncio.current_task()

        if task and not task.done() and not called_from_detection_task:
            task.cancel()

        self.audio_buffer.clear()

        if task and not called_from_detection_task:
            try:
                await task
            except asyncio.CancelledError:
                pass

    async def _run_detection_with_timeout(self):
        """Background task body: collect audio, then classify it."""
        try:
            await self._wait_for_audio_collection()

            if self._is_cancelled:
                logger.info("Detection cancelled during audio collection")
                return

            await self._process_detection()

        except asyncio.CancelledError:
            logger.info("Voicemail detection task cancelled")
            # Propagate so the task is recorded as cancelled; stop_detection()
            # absorbs this when it awaits the task.
            raise
        except Exception as e:
            logger.error(f"Error in voicemail detection: {e}")
        finally:
            self.is_detecting = False

    async def _wait_for_audio_collection(self):
        """Wait until enough audio is buffered, or give up after a grace period."""
        try:
            await asyncio.wait_for(
                self._audio_collected_event.wait(),
                timeout=self.detection_duration + 2.0,
            )

            if not self._is_cancelled:
                current_duration = self._current_duration_seconds()
                logger.info(
                    f"Collected {current_duration:.1f}s of audio for voicemail detection (sample rate: {self._sample_rate}Hz)"
                )
        except asyncio.TimeoutError:
            # Freeze the buffer: detection proceeds with whatever was collected.
            self._audio_collected_event.set()
            if not self._is_cancelled:
                current_duration = self._current_duration_seconds()
                logger.warning("Audio collection timeout exceeded")
                logger.info(
                    f"Proceeding with {current_duration:.1f}s of audio (sample rate: {self._sample_rate}Hz)"
                )

    async def _process_detection(self):
        """Transcribe and classify the collected audio, then act on the result."""
        if not self.audio_buffer or not self._engine:
            logger.warning("No audio buffer or engine available for detection")
            return

        try:
            # Convert PCM to WAV once for both transcription and storage
            wav_data = self._create_wav_from_pcm(bytes(self.audio_buffer))

            logger.info("Transcribing audio for voicemail detection")
            transcript = await self._transcribe_audio(wav_data)

            if not transcript:
                logger.warning("No transcript obtained from audio")
                # Still upload the raw recording so data pipeline has it
                if self.workflow_run_id:
                    await self._save_voicemail_audio(wav_data, 0.0, False)
                return

            logger.info(
                f"Voicemail detection transcript obtained: {transcript[:100]}..."
            )

            result = await self._analyze_transcript(transcript)
            confidence = result.get("confidence", 0.0)
            reasoning = result.get("reasoning", "No reasoning provided")
            is_voicemail = bool(result.get("is_voicemail", False))

            # Save the recording once for the data pipeline.
            if self.workflow_run_id:
                await self._save_voicemail_audio(wav_data, confidence, is_voicemail)

            if not is_voicemail:
                logger.info("No voicemail detected, continuing normal conversation")
                return

            logger.info(f"Voicemail detected with confidence {confidence}: {reasoning}")

            if self.workflow_run_id:
                try:
                    await self._tag_workflow_run_as_voicemail(transcript, confidence)
                except Exception as e:
                    # Tagging is bookkeeping; the call must still be ended.
                    logger.error(f"Failed to tag workflow run as voicemail: {e}")

            await self._engine.send_end_task_frame(
                reason=EndTaskReason.VOICEMAIL_DETECTED.value,
                abort_immediately=True,
            )

        except Exception as e:
            logger.error(f"Error processing voicemail detection: {e}")

    async def _tag_workflow_run_as_voicemail(self, transcript: str, confidence) -> None:
        """Record the voicemail tags, transcript and confidence on the workflow run."""
        workflow_run = await db_client.get_workflow_run_by_id(self.workflow_run_id)
        if not workflow_run:
            return

        existing_context = workflow_run.gathered_context or {}
        call_tags = list(existing_context.get("call_tags") or [])
        for tag in ("voicemail_detected", "not_connected"):
            if tag not in call_tags:
                call_tags.append(tag)

        await db_client.update_workflow_run(
            run_id=workflow_run.id,
            gathered_context={
                "call_tags": call_tags,
                "voicemail_transcript": transcript,
                "voicemail_confidence": confidence,
            },
        )

    async def _transcribe_audio(self, wav_data: bytes) -> str:
        """Transcribe audio using OpenAI API directly.

        Args:
            wav_data: WAV formatted audio data

        Returns:
            The transcript text with surrounding whitespace stripped.
        """
        client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # Direct API call - no pipeline involvement
        response = await client.audio.transcriptions.create(
            file=("audio.wav", wav_data, "audio/wav"),
            model="whisper-1",  # Using whisper-1 as it's more stable for transcription
            language="en",
            temperature=0.0,
        )

        return response.text.strip()

    def _create_wav_from_pcm(self, pcm_data: bytes) -> bytes:
        """Wrap raw 16-bit PCM in a WAV container using the recorded format.

        Raises:
            ValueError: If no audio packet has been received yet, so the
                sample rate is unknown.
        """
        if not self._sample_rate:
            raise ValueError("Cannot create WAV: sample rate is not known yet")

        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(self._num_channels)
            wav_file.setsampwidth(self._SAMPLE_WIDTH_BYTES)
            wav_file.setframerate(self._sample_rate)
            wav_file.writeframes(pcm_data)

        return wav_buffer.getvalue()

    async def _request_classification(self, client, messages: list):
        """Send the classification prompt to the chat model and return the raw response."""
        return await client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            temperature=0.0,
            response_format={"type": "json_object"},
        )

    async def _analyze_transcript(self, transcript: str) -> dict:
        """Classify a transcript as voicemail / not voicemail.

        Args:
            transcript: Transcribed opening of the call.

        Returns:
            The model's JSON object; a not-voicemail result with confidence
            0.0 is returned when the reply cannot be parsed into an object.
        """
        # Capture the current turn context for proper span nesting
        parent_context = get_current_turn_context()

        client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        langfuse_prompt = None
        try:
            langfuse_prompt = self._langfuse_client.get_prompt(
                "production/voicemail_detection"
            )
            prompt = langfuse_prompt.compile(transcript=transcript)
        except Exception as e:
            logger.warning(f"Error getting prompt from Langfuse: {e}")
            # The fallback prompt is what gets sent, so don't link the
            # generation to a Langfuse prompt that was not actually used.
            langfuse_prompt = None
            prompt = DEFAULT_VOICEMAIL_PROMPT.replace("{transcript}", transcript)

        messages = [{"role": "system", "content": prompt}]

        if parent_context and is_tracing_enabled():
            # Activate the parent context so the Langfuse generation started
            # below is created while it is current.
            token = otel_context.attach(parent_context)
            try:
                langfuse_generation = None
                try:
                    langfuse_generation = self._langfuse_client.start_generation(
                        name="voicemail_detection",
                        model="gpt-4o",
                        input=messages,
                        metadata={
                            "temperature": 0.0,
                            "detection_duration": self.detection_duration,
                            "transcript_length": len(transcript),
                        },
                        prompt=langfuse_prompt,
                    )
                except Exception as e:
                    logger.warning(f"Error starting Langfuse generation: {e}")

                try:
                    response = await self._request_classification(client, messages)
                    llm_response = response.choices[0].message.content

                    if langfuse_generation:
                        usage = response.usage
                        try:
                            langfuse_generation.update(
                                output=llm_response,
                                usage_details={
                                    "prompt_tokens": usage.prompt_tokens
                                    if usage
                                    else 0,
                                    "completion_tokens": usage.completion_tokens
                                    if usage
                                    else 0,
                                    "total_tokens": usage.total_tokens if usage else 0,
                                },
                            )
                        except Exception as e:
                            logger.warning(f"Error updating Langfuse generation: {e}")
                finally:
                    # End the generation even when the API call failed, so it
                    # is not left open.
                    if langfuse_generation:
                        try:
                            langfuse_generation.end()
                        except Exception as e:
                            logger.warning(f"Error ending Langfuse generation: {e}")
            finally:
                # Detach the context
                otel_context.detach(token)
        else:
            # No parent context or tracing disabled - just make the API call
            response = await self._request_classification(client, messages)
            llm_response = response.choices[0].message.content

        parsed = self._parse_detection_response(llm_response)
        if parsed is None:
            logger.warning("Invalid JSON response from voicemail detection")
            return {
                "is_voicemail": False,
                "confidence": 0.0,
                "reasoning": "Invalid response",
            }
        return parsed

    async def _save_voicemail_audio(
        self, wav_data: bytes, confidence: float, is_voicemail: bool
    ) -> Optional[str]:
        """Save voicemail audio to temp file and enqueue task to upload to S3.

        Args:
            wav_data: WAV formatted audio data
            confidence: Detection confidence score
            is_voicemail: Whether it was detected as voicemail

        Returns:
            The expected S3 object key (bucket path), or None if saving or
            enqueueing failed. The actual upload happens asynchronously.
        """
        temp_file_path: Optional[str] = None
        try:
            # Create filename with prediction, confidence and duration
            duration_seconds = self._current_duration_seconds()
            prediction = "voicemail" if is_voicemail else "not_voicemail"
            confidence_int = int(confidence * 100)
            duration_int = int(duration_seconds)
            s3_key = f"voicemail_detections/{self.workflow_run_id}_{prediction}_{confidence_int}_{duration_int}.wav"

            # Write WAV data to temp file - DO NOT delete it here, the async task will handle cleanup
            with tempfile.NamedTemporaryFile(
                suffix=".wav",
                delete=False,  # Important: don't delete immediately
                prefix=f"voicemail_{self.workflow_run_id}_",
            ) as tmp_file:
                temp_file_path = tmp_file.name
                tmp_file.write(wav_data)
                tmp_file.flush()

            logger.info(f"Saved voicemail audio to temp file: {temp_file_path}")

            # Enqueue async task to upload to S3
            await enqueue_job(
                FunctionNames.UPLOAD_VOICEMAIL_AUDIO_TO_S3,
                self.workflow_run_id,
                temp_file_path,
                s3_key,
            )

            logger.info(f"Enqueued voicemail audio upload task for: {s3_key}")
            return s3_key

        except Exception as e:
            logger.error(f"Failed to save voicemail audio: {e}")
            # Nothing will consume the temp file if we got here, so remove it.
            if temp_file_path is not None and os.path.exists(temp_file_path):
                try:
                    os.remove(temp_file_path)
                except OSError as cleanup_error:
                    logger.warning(
                        f"Failed to cleanup temp file after error: {cleanup_error}"
                    )
            return None
|
||||
Loading…
Add table
Add a link
Reference in a new issue