rowboat/apps/experimental/twilio_handler/app.py
2025-04-07 23:53:17 +05:30

633 lines
No EOL
25 KiB
Python

from flask import Flask, request, jsonify, Response
from twilio.twiml.voice_response import VoiceResponse, Gather
import os
import logging
import uuid
from typing import Dict, Any, Optional
import json
from time import time
from rowboat.schema import SystemMessage, UserMessage, ApiMessage
import elevenlabs
# Load environment variables
from load_env import load_environment
load_environment()
from twilio_api import process_conversation_turn
# Import MongoDB utility functions
from util import (
get_call_state,
save_call_state,
delete_call_state,
get_mongodb_status,
get_twilio_config,
CallState
)
Message = SystemMessage | UserMessage
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
elevenlabs_client = elevenlabs.ElevenLabs(api_key=ELEVENLABS_API_KEY)
app = Flask(__name__)
# Configure logging to stdout for Docker compatibility
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[logging.StreamHandler()] # Send logs to stdout
)
logger = logging.getLogger(__name__)
# Local in-memory cache of call state (temporary cache only - not primary storage)
# MongoDB is the primary storage for state across multiple instances
active_calls = {}
# TTS configuration
TTS_VOICE = "Markus - Mature and Chill"
TTS_MODEL = "eleven_flash_v2_5"
@app.route('/inbound', methods=['POST'])
def handle_inbound_call():
"""Handle incoming calls to Twilio numbers configured for RowBoat"""
try:
# Log the entire request for debugging
logger.info(f"Received inbound call request: {request.values}")
# Get the Twilio phone number that received the call
to_number = request.values.get('To')
call_sid = request.values.get('CallSid')
from_number = request.values.get('From')
logger.info(f"Inbound call from {from_number} to {to_number}, CallSid: {call_sid}")
logger.info(f"Raw To number value: '{to_number}', Type: {type(to_number)}")
# Get configuration ONLY from MongoDB
system_prompt = "You are a helpful assistant. Provide concise and clear answers."
workflow_id = None
project_id = None
# Look up configuration in MongoDB
twilio_config = get_twilio_config(to_number)
if twilio_config:
workflow_id = twilio_config['workflow_id']
project_id = twilio_config['project_id']
system_prompt = twilio_config.get('system_prompt', system_prompt)
logger.info(f"Found MongoDB configuration for {to_number}: project_id={project_id}, workflow_id={workflow_id}")
else:
logger.warning(f"No active configuration found in MongoDB for phone number {to_number}")
if not workflow_id:
# No workflow found - provide error message
logger.error(f"No workflow_id found for inbound call to {to_number}")
response = VoiceResponse()
response.say("I'm sorry, this phone number is not properly configured in our system. Please contact support.", voice='alice')
# Include additional information in TwiML for debugging
response.say(f"Received call to number {to_number}", voice='alice')
response.hangup()
return str(response)
# Initialize call state with stateless API fields
call_state = CallState(
workflow_id=workflow_id,
project_id=project_id,
system_prompt=system_prompt,
conversation_history=[],
messages=[], # For stateless API
state=None, # For stateless API state
turn_count=0,
inbound=True,
to_number=to_number,
created_at=int(time()) # Add timestamp for expiration tracking
)
# Save to MongoDB (primary source of truth)
try:
save_call_state(call_sid, call_state)
logger.info(f"Saved initial call state to MongoDB for inbound call {call_sid}")
except Exception as e:
logger.error(f"Error saving inbound call state to MongoDB: {str(e)}")
raise RuntimeError(f"Failed to save call state to MongoDB: {str(e)}")
# Only use memory storage as a temporary cache
# The service that handles the next request might be different
active_calls[call_sid] = call_state
logger.info(f"Initialized call state for {call_sid}, proceeding to handle_call")
# Create a direct response instead of redirecting
return handle_call(call_sid, workflow_id, project_id)
except Exception as e:
# Log the full error with traceback
import traceback
logger.error(f"Error in handle_inbound_call: {str(e)}")
logger.error(traceback.format_exc())
# Return a basic TwiML response so Twilio doesn't get a 500 error
response = VoiceResponse()
response.say("I'm sorry, we encountered an error processing your call. Please try again later.", voice='alice')
response.hangup()
return str(response)
@app.route('/twiml', methods=['POST'])
def handle_twiml_call():
"""TwiML endpoint for outbound call handling"""
call_sid = request.values.get('CallSid')
# Get call state to retrieve workflow_id and project_id
call_state = get_call_state(call_sid)
if call_state:
workflow_id = call_state.get('workflow_id')
project_id = call_state.get('project_id')
return handle_call(call_sid, workflow_id, project_id)
else:
# No call state found - error response
response = VoiceResponse()
response.say("I'm sorry, your call session has expired. Please try again.", voice='alice')
response.hangup()
return str(response)
def handle_call(call_sid, workflow_id, project_id=None):
"""Common handler for both inbound and outbound calls"""
try:
logger.info(f"handle_call: processing call {call_sid} with workflow {workflow_id}, project_id {project_id}")
# Get or initialize call state, first from MongoDB
call_state = None
try:
# Query MongoDB for the call state
call_state = get_call_state(call_sid)
if call_state:
logger.info(f"Loaded and restored call state from MongoDB for {call_sid}")
except Exception as e:
logger.error(f"Error retrieving MongoDB state for {call_sid}: {str(e)}")
call_state = None
# Try in-memory cache as fallback (temporary local cache)
if call_state is None and call_sid in active_calls:
call_state = active_calls.get(call_sid)
logger.info(f"Using in-memory cache for call state of {call_sid}")
# Initialize new state if needed
if call_state is None and workflow_id:
call_state = CallState(
workflow_id=workflow_id,
project_id=project_id,
system_prompt="You are a helpful assistant. Provide concise and clear answers.",
conversation_history=[],
messages=[], # For stateless API
state=None, # For stateless API state
turn_count=0,
inbound=False, # Default for outbound calls
to_number="", # This will be set properly for inbound calls
created_at=int(time()), # Add timestamp for expiration tracking
last_transcription=""
)
# Save to MongoDB (primary source of truth)
try:
save_call_state(call_sid, call_state)
logger.info(f"Initialized and saved new call state to MongoDB for {call_sid}")
except Exception as e:
logger.error(f"Error saving new call state to MongoDB: {str(e)}")
raise RuntimeError(f"Failed to save call state to MongoDB: {str(e)}")
# Only use memory as temporary cache for this request
active_calls[call_sid] = call_state
logger.info(f"Initialized new call state for {call_sid}")
logger.info(f"Using call state: {call_state}")
# Create TwiML response
response = VoiceResponse()
# Check if this is a new call (no turns yet)
if call_state.get('turn_count', 0) == 0:
logger.info("First turn: generating AI greeting using an empty user input...")
# Generate greeting by calling process_conversation_turn with empty user input
try:
ai_greeting, updated_messages, updated_state = process_conversation_turn(
user_input="", # empty to signal "give me your greeting"
workflow_id=call_state['workflow_id'],
system_prompt=call_state['system_prompt'],
previous_messages=[],
previous_state=None,
project_id=call_state.get('project_id')
)
except Exception as e:
logger.error(f"Error generating AI greeting: {str(e)}")
ai_greeting = "Hello, I encountered an issue creating a greeting. How can I help you?"
# Fallback: no changes to updated_messages/updated_state
updated_messages = []
updated_state = None
# Update call_state with AI greeting
call_state['messages'] = updated_messages
call_state['state'] = updated_state
call_state['conversation_history'].append({
'user': "", # empty user
'assistant': ai_greeting
})
call_state['turn_count'] = 1
# Save changes to MongoDB
try:
save_call_state(call_sid, call_state)
logger.info(f"Saved greeting state to MongoDB for {call_sid}")
except Exception as e:
logger.error(f"Error saving greeting state to MongoDB: {str(e)}")
raise RuntimeError(f"Failed to save greeting state to MongoDB: {str(e)}")
active_calls[call_sid] = call_state
# Play the greeting via streaming audio
unique_id = str(uuid.uuid4())
audio_url = f"/stream-audio/{call_sid}/greeting/{unique_id}"
logger.info(f"Will stream greeting from {audio_url}")
response.play(audio_url)
# Gather user input next
gather = Gather(
input='speech',
action=f'/process_speech?call_sid={call_sid}',
speech_timeout='auto',
language='en-US',
enhanced=True,
speechModel='phone_call'
)
response.append(gather)
response.redirect('/twiml')
logger.info(f"Returning response: {str(response)}")
return str(response)
except Exception as e:
# Log the full error with traceback
import traceback
logger.error(f"Error in handle_call: {str(e)}")
logger.error(traceback.format_exc())
# Return a basic TwiML response
response = VoiceResponse()
response.say("I'm sorry, we encountered an error processing your call. Please try again later.", voice='alice')
response.hangup()
return str(response)
@app.route('/process_speech', methods=['POST'])
def process_speech():
"""Process user speech input and generate AI response"""
try:
logger.info(f"Processing speech: {request.values}")
call_sid = request.args.get('call_sid')
# Log all request values for debugging
logger.info(f"FULL REQUEST VALUES: {dict(request.values)}")
logger.info(f"FULL REQUEST ARGS: {dict(request.args)}")
# Get the speech result directly from Twilio
# We're now relying on Twilio's enhanced speech recognition instead of Deepgram
speech_result = request.values.get('SpeechResult')
confidence = request.values.get('Confidence')
logger.info(f"Twilio SpeechResult: {speech_result}")
logger.info(f"Twilio Confidence: {confidence}")
if not call_sid:
logger.warning(f"Missing call_sid: {call_sid}")
response = VoiceResponse()
response.say("I'm sorry, I couldn't process that request.", voice='alice')
response.hangup()
return str(response)
if not speech_result:
logger.warning("No speech result after transcription attempts")
response = VoiceResponse()
response.say("I'm sorry, I didn't catch what you said. Could you please try again?", voice='alice')
# Gather user input again
gather = Gather(
input='speech',
action=f'/process_speech?call_sid={call_sid}',
speech_timeout='auto',
language='en-US',
enhanced=True,
speechModel='phone_call'
)
response.append(gather)
# Redirect to twiml endpoint which will get call state from MongoDB
response.redirect('/twiml')
return str(response)
# Load call state from MongoDB (primary source of truth)
call_state = None
try:
call_state = get_call_state(call_sid)
if call_state:
logger.info(f"Loaded call state from MongoDB for speech processing: {call_sid}")
except Exception as e:
logger.error(f"Error retrieving MongoDB state for speech processing: {str(e)}")
call_state = None
# Try memory cache as fallback
if call_state is None and call_sid in active_calls:
call_state = active_calls[call_sid]
logger.info(f"Using in-memory state for speech processing: {call_sid}")
# Check if we have valid state
if not call_state:
logger.warning(f"No call state found for speech processing: {call_sid}")
response = VoiceResponse()
response.say("I'm sorry, your call session has expired. Please call back.", voice='alice')
response.hangup()
return str(response)
# Extract key information
workflow_id = call_state.get('workflow_id')
project_id = call_state.get('project_id')
system_prompt = call_state.get('system_prompt', "You are a helpful assistant.")
# Check if we have a Deepgram transcription stored in the call state
if 'last_transcription' in call_state and call_state['last_transcription']:
deepgram_transcription = call_state['last_transcription']
logger.info(f"Found stored Deepgram transcription: {deepgram_transcription}")
logger.info(f"Comparing with Twilio transcription: {speech_result}")
# Use the Deepgram transcription instead of Twilio's
speech_result = deepgram_transcription
# Remove it so we don't use it again
del call_state['last_transcription']
logger.info(f"Using Deepgram transcription instead")
# Log final user input that will be used
logger.info(f"Final user input: {speech_result}")
# Process with RowBoat agent
try:
# Clean up the speech result if needed
if speech_result:
# Remove any common filler words or fix typical transcription issues
import re
# Convert to lowercase for easier pattern matching
cleaned_input = speech_result.lower()
# Remove filler words that might be at the beginning
cleaned_input = re.sub(r'^(um|uh|like|so|okay|well)\s+', '', cleaned_input)
# Capitalize first letter for better appearance
if cleaned_input:
speech_result = cleaned_input[0].upper() + cleaned_input[1:]
logger.info(f"Sending to RowBoat: '{speech_result}'")
# Get previous messages and state from call state
previous_messages = call_state.get('messages', [])
previous_state = call_state.get('state')
# Process with stateless API
ai_response, updated_messages, updated_state = process_conversation_turn(
user_input=speech_result,
workflow_id=workflow_id,
system_prompt=system_prompt,
previous_messages=previous_messages,
previous_state=previous_state,
project_id=project_id
)
# Update the messages and state in call state
call_state['messages'] = updated_messages
call_state['state'] = updated_state
logger.info(f"RowBoat response: {ai_response}")
except Exception as e:
logger.error(f"Error processing with RowBoat: {str(e)}")
ai_response = "I'm sorry, I encountered an issue processing your request. Could you please try again?"
# Conversation history is updated in the streaming response section below
# Create TwiML response
response = VoiceResponse()
# Use streaming audio for the response
logger.info("Setting up response streaming with ElevenLabs")
try:
# Store the AI response in conversation history first
# (The stream-audio endpoint will read it from here)
# Update conversation history (do this before streaming so the endpoint can access it)
call_state['conversation_history'].append({
'user': speech_result,
'assistant': ai_response
})
call_state['turn_count'] += 1
# Save to MongoDB (primary source of truth)
try:
save_call_state(call_sid, call_state)
logger.info(f"Saved response state to MongoDB for {call_sid}")
except Exception as e:
logger.error(f"Error saving response state to MongoDB: {str(e)}")
raise RuntimeError(f"Failed to save response state to MongoDB: {str(e)}")
# Update local memory cache
active_calls[call_sid] = call_state
# Generate a unique ID to prevent caching
unique_id = str(uuid.uuid4())
# Use a relative URL - Twilio will use the same host as the webhook
audio_url = f"/stream-audio/{call_sid}/response/{unique_id}"
logger.info(f"Streaming response from relative URL: {audio_url}")
# Play the response via streaming
response.play(audio_url)
except Exception as e:
logger.error(f"Error with audio streaming for response: {str(e)}")
import traceback
logger.error(traceback.format_exc())
# Fallback to Twilio TTS
response.say(ai_response, voice='alice')
# Gather next user input with enhanced speech recognition
gather = Gather(
input='speech',
action=f'/process_speech?call_sid={call_sid}',
speech_timeout='auto',
language='en-US',
enhanced=True, # Enable enhanced speech recognition
speechModel='phone_call' # Optimize for phone calls
)
response.append(gather)
# If no input detected, redirect to twiml endpoint
# Call state will be retrieved from MongoDB
response.redirect('/twiml')
logger.info(f"Returning TwiML response for speech processing")
return str(response)
except Exception as e:
# Log the full error with traceback
import traceback
logger.error(f"Error in process_speech: {str(e)}")
logger.error(traceback.format_exc())
# Return a basic TwiML response
response = VoiceResponse()
response.say("I'm sorry, we encountered an error processing your speech. Please try again.", voice='alice')
response.gather(
input='speech',
action=f'/process_speech?call_sid={request.args.get("call_sid")}',
speech_timeout='auto'
)
return str(response)
@app.route('/stream-audio/<call_sid>/<text_type>/<unique_id>', methods=['GET'])
def stream_audio(call_sid, text_type, unique_id):
"""Stream audio directly from ElevenLabs to Twilio without saving to disk"""
try:
logger.info(f"Audio streaming requested for call {call_sid}, type {text_type}")
# Determine what text to synthesize
text_to_speak = ""
if text_type == "greeting" or text_type == "response":
# Get the text from call state (try MongoDB first, then memory)
call_state = None
# Try MongoDB first
try:
call_state = get_call_state(call_sid)
if call_state:
logger.info(f"Loaded call state from MongoDB for streaming: {call_sid}")
except Exception as e:
logger.error(f"Error retrieving MongoDB state for streaming: {str(e)}")
call_state = None
# Fall back to memory if needed
if call_state is None:
if call_sid not in active_calls:
logger.error(f"Call SID not found for streaming: {call_sid}")
return "Call not found", 404
call_state = active_calls[call_sid]
logger.info(f"Using in-memory state for streaming: {call_sid}")
if call_state.get('conversation_history') and len(call_state['conversation_history']) > 0:
# Get the most recent AI response
text_to_speak = call_state['conversation_history'][-1]['assistant']
else:
logger.warning(f"No conversation history found for call {call_sid}")
text_to_speak = "I'm sorry, I don't have a response ready. Could you please repeat?"
else:
# Direct text may be passed as the text_type (for testing)
text_to_speak = text_type
if not text_to_speak:
logger.error("No text to synthesize")
return "No text to synthesize", 400
logger.info(f"Streaming audio for text: {text_to_speak[:50]}...")
def generate():
try:
# Generate and stream the audio directly
audio_stream = elevenlabs_client.generate(
text=text_to_speak,
voice=TTS_VOICE,
model=TTS_MODEL,
output_format="mp3_44100_128"
)
# Stream chunks directly to the response
for chunk in audio_stream:
yield chunk
logger.info(f"Finished streaming audio for call {call_sid}")
except Exception as e:
logger.error(f"Error in audio stream generator: {str(e)}")
import traceback
logger.error(traceback.format_exc())
# Return a streaming response
response = Response(generate(), mimetype='audio/mpeg')
return response
except Exception as e:
logger.error(f"Error setting up audio stream: {str(e)}")
import traceback
logger.error(traceback.format_exc())
return "Error streaming audio", 500
@app.route('/call-status', methods=['POST'])
def call_status_callback():
"""Handle call status callbacks from Twilio"""
call_sid = request.values.get('CallSid')
call_status = request.values.get('CallStatus')
logger.info(f"Call {call_sid} status: {call_status}")
# Clean up resources when call completes
if call_status in ['completed', 'failed', 'busy', 'no-answer', 'canceled']:
# Get call state from MongoDB or memory
call_state = None
# Try to load from MongoDB first
try:
call_state = get_call_state(call_sid)
if call_state:
logger.info(f"Loaded final state from MongoDB for {call_sid}")
except Exception as e:
logger.error(f"Error retrieving final state from MongoDB: {str(e)}")
call_state = None
# Fall back to memory if needed
if call_state is None and call_sid in active_calls:
call_state = active_calls[call_sid]
logger.info(f"Using in-memory state for final call state of {call_sid}")
if call_state:
# Remove from active calls in both memory and MongoDB
if call_sid in active_calls:
del active_calls[call_sid]
logger.info(f"Removed call {call_sid} from active calls memory")
try:
# Remove the document from MongoDB
delete_call_state(call_sid)
logger.info(f"Removed call {call_sid} from MongoDB")
except Exception as e:
logger.error(f"Error removing call state from MongoDB: {str(e)}")
return '', 204
@app.route('/health', methods=['GET'])
def health_check():
"""Simple health check endpoint"""
health_data = {
"status": "healthy",
"active_calls_memory": len(active_calls)
}
# Get MongoDB status
try:
mongodb_status = get_mongodb_status()
health_data["mongodb"] = mongodb_status
health_data["active_calls_mongodb"] = mongodb_status.get("active_calls", 0)
except Exception as e:
health_data["mongodb_error"] = str(e)
health_data["status"] = "degraded"
return jsonify(health_data)
if __name__ == '__main__':
# Log startup information
logger.info(f"Starting Twilio-RowBoat server")
# Remove the explicit run configuration since Flask CLI will handle it
app.run()