diff --git a/api/Dockerfile b/api/Dockerfile index faddf34..d8afae0 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -13,6 +13,11 @@ RUN apt-get update && apt-get install -y \ # Copy and install requirements COPY api/requirements.txt . +# Install CPU-only PyTorch FIRST to prevent CUDA/NVIDIA dependencies +# This satisfies torch dependency before other packages try to pull GPU version +RUN pip install --user --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \ + rm -rf /root/.cache/pip + # Install dependencies to user directory for easy copying RUN pip install --user --no-cache-dir -r requirements.txt && \ # Clean up pip cache after installation diff --git a/api/services/pipecat/run_pipeline.py b/api/services/pipecat/run_pipeline.py index 1505179..00744f0 100644 --- a/api/services/pipecat/run_pipeline.py +++ b/api/services/pipecat/run_pipeline.py @@ -44,6 +44,7 @@ from api.services.telephony.stasis_rtp_connection import StasisRTPConnection from api.services.workflow.dto import ReactFlowDTO from api.services.workflow.pipecat_engine import PipecatEngine from api.services.workflow.workflow import WorkflowGraph +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 from pipecat.extensions.voicemail.voicemail_detector import VoicemailDetector from pipecat.pipeline.base_task import PipelineTaskParams @@ -66,6 +67,7 @@ from pipecat.turns.user_start.vad_user_turn_start_strategy import ( ) from pipecat.turns.user_stop import ( ExternalUserTurnStopStrategy, + TranscriptionUserTurnStopStrategy, TurnAnalyzerUserTurnStopStrategy, ) from pipecat.turns.user_turn_strategies import UserTurnStrategies @@ -453,6 +455,8 @@ async def _run_pipeline( # Extract configurations from workflow configurations max_call_duration_seconds = 300 # Default 5 minutes max_user_idle_timeout = 10.0 # Default 10 seconds + smart_turn_stop_secs = 2.0 # Default 2 seconds for incomplete turn timeout + turn_stop_strategy = "transcription" # Default to transcription-based detection keyterms = None # Dictionary words for STT boosting if workflow.workflow_configurations: @@ -468,6 +472,16 @@ async def _run_pipeline( "max_user_idle_timeout" ] + # Use workflow-specific smart turn stop timeout if provided + if "smart_turn_stop_secs" in workflow.workflow_configurations: + smart_turn_stop_secs = workflow.workflow_configurations[ + "smart_turn_stop_secs" + ] + + # Use workflow-specific turn stop strategy if provided + if "turn_stop_strategy" in workflow.workflow_configurations: + turn_stop_strategy = workflow.workflow_configurations["turn_stop_strategy"] + # Extract dictionary words and convert to keyterms list if "dictionary" in workflow.workflow_configurations: dictionary = workflow.workflow_configurations["dictionary"] @@ -551,9 +565,9 @@ async def _run_pipeline( correct_aggregation_callback=engine.create_aggregation_correction_callback(), ) - # Configure turn strategies based on STT provider and model + # Configure turn strategies based on STT provider, model, and workflow configuration # Deepgram Flux uses external turn detection (VAD + External start/stop) - # Other models use transcription-based turn detection with smart turn analyzer + # Other models use configurable turn detection strategy is_deepgram_flux = ( user_config.stt.provider == ServiceProviders.DEEPGRAM.value and user_config.stt.model == "flux-general-en" @@ -564,15 +578,23 @@ async def _run_pipeline( start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()], stop=[ExternalUserTurnStopStrategy()], ) - else: + elif turn_stop_strategy == "turn_analyzer": + # Smart Turn Analyzer: best for longer responses with natural pauses + smart_turn_params = SmartTurnParams(stop_secs=smart_turn_stop_secs) user_turn_strategies = UserTurnStrategies( start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()], stop=[ TurnAnalyzerUserTurnStopStrategy( - turn_analyzer=LocalSmartTurnAnalyzerV3() + turn_analyzer=LocalSmartTurnAnalyzerV3(params=smart_turn_params) ) ], ) + else: + # Transcription-based (default): best for short 1-2 word responses + user_turn_strategies = UserTurnStrategies( + start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()], + stop=[TranscriptionUserTurnStopStrategy()], + ) # Create user mute strategies # - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state diff --git a/api/tasks/knowledge_base_processing.py b/api/tasks/knowledge_base_processing.py index 9dc925b..e1a4cea 100644 --- a/api/tasks/knowledge_base_processing.py +++ b/api/tasks/knowledge_base_processing.py @@ -125,15 +125,11 @@ async def process_knowledge_base_document( embeddings_api_key = None embeddings_model = None if document.created_by: - user_config = await db_client.get_user_configurations( - document.created_by - ) + user_config = await db_client.get_user_configurations(document.created_by) if user_config.embeddings: embeddings_api_key = user_config.embeddings.api_key embeddings_model = user_config.embeddings.model - logger.info( - f"Using user embeddings config: model={embeddings_model}" - ) + logger.info(f"Using user embeddings config: model={embeddings_model}") # Check if API key is configured if not embeddings_api_key: diff --git a/ui/src/app/workflow/[workflowId]/components/ConfigurationsDialog.tsx b/ui/src/app/workflow/[workflowId]/components/ConfigurationsDialog.tsx index 68a7fdc..92bd454 100644 --- a/ui/src/app/workflow/[workflowId]/components/ConfigurationsDialog.tsx +++ b/ui/src/app/workflow/[workflowId]/components/ConfigurationsDialog.tsx @@ -4,8 +4,9 @@ import { Button } from "@/components/ui/button"; import { Dialog, DialogContent, DialogFooter, DialogHeader, DialogTitle } from "@/components/ui/dialog"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Switch } from "@/components/ui/switch"; -import { AmbientNoiseConfiguration, VADConfiguration, WorkflowConfigurations } from "@/types/workflow-configurations"; +import { AmbientNoiseConfiguration, TurnStopStrategy, WorkflowConfigurations } from "@/types/workflow-configurations"; interface ConfigurationsDialogProps { open: boolean; @@ -15,13 +16,6 @@ interface ConfigurationsDialogProps { onSave: (configurations: WorkflowConfigurations, workflowName: string) => Promise; } -const DEFAULT_VAD_CONFIG: VADConfiguration = { - confidence: 0.7, - start_seconds: 0.4, - stop_seconds: 0.8, - minimum_volume: 0.6, -}; - const DEFAULT_AMBIENT_NOISE_CONFIG: AmbientNoiseConfiguration = { enabled: false, volume: 0.3, @@ -35,9 +29,6 @@ export const ConfigurationsDialog = ({ onSave }: ConfigurationsDialogProps) => { const [name, setName] = useState(workflowName); - const [vadConfig, setVadConfig] = useState( - workflowConfigurations?.vad_configuration || DEFAULT_VAD_CONFIG - ); const [ambientNoiseConfig, setAmbientNoiseConfig] = useState( workflowConfigurations?.ambient_noise_configuration || DEFAULT_AMBIENT_NOISE_CONFIG ); @@ -47,16 +38,23 @@ export const ConfigurationsDialog = ({ const [maxUserIdleTimeout, setMaxUserIdleTimeout] = useState( workflowConfigurations?.max_user_idle_timeout || 10 // Default 10 seconds ); + const [smartTurnStopSecs, setSmartTurnStopSecs] = useState( + workflowConfigurations?.smart_turn_stop_secs || 2 // Default 2 seconds + ); + const [turnStopStrategy, setTurnStopStrategy] = useState( + workflowConfigurations?.turn_stop_strategy || 'transcription' + ); const [isSaving, setIsSaving] = useState(false); const handleSave = async () => { setIsSaving(true); try { await onSave({ - vad_configuration: vadConfig, ambient_noise_configuration: ambientNoiseConfig, max_call_duration: maxCallDuration, - max_user_idle_timeout: maxUserIdleTimeout + max_user_idle_timeout: maxUserIdleTimeout, + smart_turn_stop_secs: smartTurnStopSecs, + turn_stop_strategy: turnStopStrategy }, name); onOpenChange(false); } catch (error) { @@ -70,23 +68,14 @@ export const ConfigurationsDialog = ({ useEffect(() => { if (open) { setName(workflowName); - setVadConfig(workflowConfigurations?.vad_configuration || DEFAULT_VAD_CONFIG); setAmbientNoiseConfig(workflowConfigurations?.ambient_noise_configuration || DEFAULT_AMBIENT_NOISE_CONFIG); setMaxCallDuration(workflowConfigurations?.max_call_duration || 600); setMaxUserIdleTimeout(workflowConfigurations?.max_user_idle_timeout || 10); + setSmartTurnStopSecs(workflowConfigurations?.smart_turn_stop_secs || 2); + setTurnStopStrategy(workflowConfigurations?.turn_stop_strategy || 'transcription'); } }, [open, workflowName, workflowConfigurations]); - const handleVadChange = (field: keyof VADConfiguration, value: string) => { - const numValue = parseFloat(value); - if (!isNaN(numValue)) { - setVadConfig(prev => ({ - ...prev, - [field]: numValue - })); - } - }; - return ( @@ -117,76 +106,6 @@ export const ConfigurationsDialog = ({ - {/* Voice Activity Detection Section */} -
-
-

Voice Activity Detection

-

- Hyperparameters to set for voice activity detection. Already configured with defaults. -

-
- -
-
- - handleVadChange('confidence', e.target.value)} - /> -
- -
- - handleVadChange('start_seconds', e.target.value)} - /> -
- -
- - handleVadChange('stop_seconds', e.target.value)} - /> -
- -
- - handleVadChange('minimum_volume', e.target.value)} - /> -
-
-
- {/* Ambient Noise Section */}
@@ -234,6 +153,68 @@ export const ConfigurationsDialog = ({
+ {/* Turn Detection Section */} +
+
+

Turn Detection

+

+ Configure how the agent detects when the user has finished speaking. +

+
+ +
+ + +

+ {turnStopStrategy === 'transcription' + ? "Best for short responses (1-2 word statements). Ends turn when transcription indicates completion." + : "Best for longer responses with natural pauses. Uses ML model to detect end of turn."} +

+
+ + {turnStopStrategy === 'turn_analyzer' && ( +
+ + { + const value = parseFloat(e.target.value); + if (!isNaN(value) && value >= 0.5) { + setSmartTurnStopSecs(value); + } + }} + /> +

+ Max silence duration before ending an incomplete turn. Default: 2 seconds +

+
+ )} +
+ {/* Call Management Section */}
diff --git a/ui/src/types/workflow-configurations.ts b/ui/src/types/workflow-configurations.ts index 93a3c05..9f44063 100644 --- a/ui/src/types/workflow-configurations.ts +++ b/ui/src/types/workflow-configurations.ts @@ -10,27 +10,27 @@ export interface AmbientNoiseConfiguration { volume: number; } +export type TurnStopStrategy = 'transcription' | 'turn_analyzer'; + export interface WorkflowConfigurations { - vad_configuration: VADConfiguration; + vad_configuration?: VADConfiguration; ambient_noise_configuration: AmbientNoiseConfiguration; max_call_duration: number; // Maximum call duration in seconds max_user_idle_timeout: number; // Maximum user idle time in seconds + smart_turn_stop_secs: number; // Timeout in seconds for incomplete turn detection + turn_stop_strategy: TurnStopStrategy; // Strategy for detecting end of user turn dictionary?: string; // Comma-separated words for voice agent to listen for [key: string]: unknown; // Allow additional properties for future configurations } export const DEFAULT_WORKFLOW_CONFIGURATIONS: WorkflowConfigurations = { - vad_configuration: { - confidence: 0.7, - start_seconds: 0.4, - stop_seconds: 0.8, - minimum_volume: 0.6 - }, ambient_noise_configuration: { enabled: false, volume: 0.3 }, max_call_duration: 600, // 10 minutes max_user_idle_timeout: 10, // 10 seconds + smart_turn_stop_secs: 2, // 2 seconds + turn_stop_strategy: 'transcription', // Default to transcription-based detection dictionary: '' };