mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
make turn detection configurable
This commit is contained in:
parent
2d4a7b49b0
commit
8f67e48d45
5 changed files with 115 additions and 111 deletions
|
|
@ -13,6 +13,11 @@ RUN apt-get update && apt-get install -y \
|
|||
# Copy and install requirements
|
||||
COPY api/requirements.txt .
|
||||
|
||||
# Install CPU-only PyTorch FIRST to prevent CUDA/NVIDIA dependencies
|
||||
# This satisfies the torch dependency before other packages try to pull in the GPU version
|
||||
RUN pip install --user --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
|
||||
rm -rf /root/.cache/pip
|
||||
|
||||
# Install dependencies to user directory for easy copying
|
||||
RUN pip install --user --no-cache-dir -r requirements.txt && \
|
||||
# Clean up pip cache after installation
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
|
|||
from api.services.workflow.dto import ReactFlowDTO
|
||||
from api.services.workflow.pipecat_engine import PipecatEngine
|
||||
from api.services.workflow.workflow import WorkflowGraph
|
||||
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
|
||||
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
|
||||
from pipecat.extensions.voicemail.voicemail_detector import VoicemailDetector
|
||||
from pipecat.pipeline.base_task import PipelineTaskParams
|
||||
|
|
@ -66,6 +67,7 @@ from pipecat.turns.user_start.vad_user_turn_start_strategy import (
|
|||
)
|
||||
from pipecat.turns.user_stop import (
|
||||
ExternalUserTurnStopStrategy,
|
||||
TranscriptionUserTurnStopStrategy,
|
||||
TurnAnalyzerUserTurnStopStrategy,
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
|
@ -453,6 +455,8 @@ async def _run_pipeline(
|
|||
# Extract configurations from workflow configurations
|
||||
max_call_duration_seconds = 300 # Default 5 minutes
|
||||
max_user_idle_timeout = 10.0 # Default 10 seconds
|
||||
smart_turn_stop_secs = 2.0 # Default 2 seconds for incomplete turn timeout
|
||||
turn_stop_strategy = "transcription" # Default to transcription-based detection
|
||||
keyterms = None # Dictionary words for STT boosting
|
||||
|
||||
if workflow.workflow_configurations:
|
||||
|
|
@ -468,6 +472,16 @@ async def _run_pipeline(
|
|||
"max_user_idle_timeout"
|
||||
]
|
||||
|
||||
# Use workflow-specific smart turn stop timeout if provided
|
||||
if "smart_turn_stop_secs" in workflow.workflow_configurations:
|
||||
smart_turn_stop_secs = workflow.workflow_configurations[
|
||||
"smart_turn_stop_secs"
|
||||
]
|
||||
|
||||
# Use workflow-specific turn stop strategy if provided
|
||||
if "turn_stop_strategy" in workflow.workflow_configurations:
|
||||
turn_stop_strategy = workflow.workflow_configurations["turn_stop_strategy"]
|
||||
|
||||
# Extract dictionary words and convert to keyterms list
|
||||
if "dictionary" in workflow.workflow_configurations:
|
||||
dictionary = workflow.workflow_configurations["dictionary"]
|
||||
|
|
@ -551,9 +565,9 @@ async def _run_pipeline(
|
|||
correct_aggregation_callback=engine.create_aggregation_correction_callback(),
|
||||
)
|
||||
|
||||
# Configure turn strategies based on STT provider and model
|
||||
# Configure turn strategies based on STT provider, model, and workflow configuration
|
||||
# Deepgram Flux uses external turn detection (VAD + External start/stop)
|
||||
# Other models use transcription-based turn detection with smart turn analyzer
|
||||
# Other models use configurable turn detection strategy
|
||||
is_deepgram_flux = (
|
||||
user_config.stt.provider == ServiceProviders.DEEPGRAM.value
|
||||
and user_config.stt.model == "flux-general-en"
|
||||
|
|
@ -564,15 +578,23 @@ async def _run_pipeline(
|
|||
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
|
||||
stop=[ExternalUserTurnStopStrategy()],
|
||||
)
|
||||
else:
|
||||
elif turn_stop_strategy == "turn_analyzer":
|
||||
# Smart Turn Analyzer: best for longer responses with natural pauses
|
||||
smart_turn_params = SmartTurnParams(stop_secs=smart_turn_stop_secs)
|
||||
user_turn_strategies = UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
|
||||
stop=[
|
||||
TurnAnalyzerUserTurnStopStrategy(
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3()
|
||||
turn_analyzer=LocalSmartTurnAnalyzerV3(params=smart_turn_params)
|
||||
)
|
||||
],
|
||||
)
|
||||
else:
|
||||
# Transcription-based (default): best for short 1-2 word responses
|
||||
user_turn_strategies = UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
|
||||
stop=[TranscriptionUserTurnStopStrategy()],
|
||||
)
|
||||
|
||||
# Create user mute strategies
|
||||
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
|
||||
|
|
|
|||
|
|
@ -125,15 +125,11 @@ async def process_knowledge_base_document(
|
|||
embeddings_api_key = None
|
||||
embeddings_model = None
|
||||
if document.created_by:
|
||||
user_config = await db_client.get_user_configurations(
|
||||
document.created_by
|
||||
)
|
||||
user_config = await db_client.get_user_configurations(document.created_by)
|
||||
if user_config.embeddings:
|
||||
embeddings_api_key = user_config.embeddings.api_key
|
||||
embeddings_model = user_config.embeddings.model
|
||||
logger.info(
|
||||
f"Using user embeddings config: model={embeddings_model}"
|
||||
)
|
||||
logger.info(f"Using user embeddings config: model={embeddings_model}")
|
||||
|
||||
# Check if API key is configured
|
||||
if not embeddings_api_key:
|
||||
|
|
|
|||
|
|
@ -4,8 +4,9 @@ import { Button } from "@/components/ui/button";
|
|||
import { Dialog, DialogContent, DialogFooter, DialogHeader, DialogTitle } from "@/components/ui/dialog";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
|
||||
import { Switch } from "@/components/ui/switch";
|
||||
import { AmbientNoiseConfiguration, VADConfiguration, WorkflowConfigurations } from "@/types/workflow-configurations";
|
||||
import { AmbientNoiseConfiguration, TurnStopStrategy, WorkflowConfigurations } from "@/types/workflow-configurations";
|
||||
|
||||
interface ConfigurationsDialogProps {
|
||||
open: boolean;
|
||||
|
|
@ -15,13 +16,6 @@ interface ConfigurationsDialogProps {
|
|||
onSave: (configurations: WorkflowConfigurations, workflowName: string) => Promise<void>;
|
||||
}
|
||||
|
||||
const DEFAULT_VAD_CONFIG: VADConfiguration = {
|
||||
confidence: 0.7,
|
||||
start_seconds: 0.4,
|
||||
stop_seconds: 0.8,
|
||||
minimum_volume: 0.6,
|
||||
};
|
||||
|
||||
const DEFAULT_AMBIENT_NOISE_CONFIG: AmbientNoiseConfiguration = {
|
||||
enabled: false,
|
||||
volume: 0.3,
|
||||
|
|
@ -35,9 +29,6 @@ export const ConfigurationsDialog = ({
|
|||
onSave
|
||||
}: ConfigurationsDialogProps) => {
|
||||
const [name, setName] = useState<string>(workflowName);
|
||||
const [vadConfig, setVadConfig] = useState<VADConfiguration>(
|
||||
workflowConfigurations?.vad_configuration || DEFAULT_VAD_CONFIG
|
||||
);
|
||||
const [ambientNoiseConfig, setAmbientNoiseConfig] = useState<AmbientNoiseConfiguration>(
|
||||
workflowConfigurations?.ambient_noise_configuration || DEFAULT_AMBIENT_NOISE_CONFIG
|
||||
);
|
||||
|
|
@ -47,16 +38,23 @@ export const ConfigurationsDialog = ({
|
|||
const [maxUserIdleTimeout, setMaxUserIdleTimeout] = useState<number>(
|
||||
workflowConfigurations?.max_user_idle_timeout || 10 // Default 10 seconds
|
||||
);
|
||||
const [smartTurnStopSecs, setSmartTurnStopSecs] = useState<number>(
|
||||
workflowConfigurations?.smart_turn_stop_secs || 2 // Default 2 seconds
|
||||
);
|
||||
const [turnStopStrategy, setTurnStopStrategy] = useState<TurnStopStrategy>(
|
||||
workflowConfigurations?.turn_stop_strategy || 'transcription'
|
||||
);
|
||||
const [isSaving, setIsSaving] = useState(false);
|
||||
|
||||
const handleSave = async () => {
|
||||
setIsSaving(true);
|
||||
try {
|
||||
await onSave({
|
||||
vad_configuration: vadConfig,
|
||||
ambient_noise_configuration: ambientNoiseConfig,
|
||||
max_call_duration: maxCallDuration,
|
||||
max_user_idle_timeout: maxUserIdleTimeout
|
||||
max_user_idle_timeout: maxUserIdleTimeout,
|
||||
smart_turn_stop_secs: smartTurnStopSecs,
|
||||
turn_stop_strategy: turnStopStrategy
|
||||
}, name);
|
||||
onOpenChange(false);
|
||||
} catch (error) {
|
||||
|
|
@ -70,23 +68,14 @@ export const ConfigurationsDialog = ({
|
|||
useEffect(() => {
|
||||
if (open) {
|
||||
setName(workflowName);
|
||||
setVadConfig(workflowConfigurations?.vad_configuration || DEFAULT_VAD_CONFIG);
|
||||
setAmbientNoiseConfig(workflowConfigurations?.ambient_noise_configuration || DEFAULT_AMBIENT_NOISE_CONFIG);
|
||||
setMaxCallDuration(workflowConfigurations?.max_call_duration || 600);
|
||||
setMaxUserIdleTimeout(workflowConfigurations?.max_user_idle_timeout || 10);
|
||||
setSmartTurnStopSecs(workflowConfigurations?.smart_turn_stop_secs || 2);
|
||||
setTurnStopStrategy(workflowConfigurations?.turn_stop_strategy || 'transcription');
|
||||
}
|
||||
}, [open, workflowName, workflowConfigurations]);
|
||||
|
||||
const handleVadChange = (field: keyof VADConfiguration, value: string) => {
|
||||
const numValue = parseFloat(value);
|
||||
if (!isNaN(numValue)) {
|
||||
setVadConfig(prev => ({
|
||||
...prev,
|
||||
[field]: numValue
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||
<DialogContent className="max-w-lg">
|
||||
|
|
@ -117,76 +106,6 @@ export const ConfigurationsDialog = ({
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{/* Voice Activity Detection Section */}
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm font-semibold mb-1">Voice Activity Detection</h3>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Hyperparameters to set for voice activity detection. Already configured with defaults.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div className="grid grid-cols-2 gap-4">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="confidence" className="text-xs">
|
||||
Confidence
|
||||
</Label>
|
||||
<Input
|
||||
id="confidence"
|
||||
type="number"
|
||||
step="0.1"
|
||||
min="0"
|
||||
max="1"
|
||||
value={vadConfig.confidence}
|
||||
onChange={(e) => handleVadChange('confidence', e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="start_seconds" className="text-xs">
|
||||
Start Seconds
|
||||
</Label>
|
||||
<Input
|
||||
id="start_seconds"
|
||||
type="number"
|
||||
step="0.1"
|
||||
min="0"
|
||||
value={vadConfig.start_seconds}
|
||||
onChange={(e) => handleVadChange('start_seconds', e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="stop_seconds" className="text-xs">
|
||||
Stop Seconds
|
||||
</Label>
|
||||
<Input
|
||||
id="stop_seconds"
|
||||
type="number"
|
||||
step="0.1"
|
||||
min="0"
|
||||
value={vadConfig.stop_seconds}
|
||||
onChange={(e) => handleVadChange('stop_seconds', e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="minimum_volume" className="text-xs">
|
||||
Minimum Volume
|
||||
</Label>
|
||||
<Input
|
||||
id="minimum_volume"
|
||||
type="number"
|
||||
step="0.1"
|
||||
min="0"
|
||||
max="1"
|
||||
value={vadConfig.minimum_volume}
|
||||
onChange={(e) => handleVadChange('minimum_volume', e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Ambient Noise Section */}
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
|
|
@ -234,6 +153,68 @@ export const ConfigurationsDialog = ({
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{/* Turn Detection Section */}
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
<h3 className="text-sm font-semibold mb-1">Turn Detection</h3>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Configure how the agent detects when the user has finished speaking.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="turn_stop_strategy" className="text-xs">
|
||||
Detection Strategy
|
||||
</Label>
|
||||
<Select
|
||||
value={turnStopStrategy}
|
||||
onValueChange={(value: TurnStopStrategy) => setTurnStopStrategy(value)}
|
||||
>
|
||||
<SelectTrigger id="turn_stop_strategy">
|
||||
<SelectValue placeholder="Select strategy" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value="transcription">
|
||||
Transcription-based
|
||||
</SelectItem>
|
||||
<SelectItem value="turn_analyzer">
|
||||
Smart Turn Analyzer
|
||||
</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{turnStopStrategy === 'transcription'
|
||||
? "Best for short responses (1-2 word statements). Ends turn when transcription indicates completion."
|
||||
: "Best for longer responses with natural pauses. Uses ML model to detect end of turn."}
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{turnStopStrategy === 'turn_analyzer' && (
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="smart_turn_stop_secs" className="text-xs">
|
||||
Incomplete Turn Timeout (seconds)
|
||||
</Label>
|
||||
<Input
|
||||
id="smart_turn_stop_secs"
|
||||
type="number"
|
||||
step="0.5"
|
||||
min="0.5"
|
||||
max="10"
|
||||
value={smartTurnStopSecs}
|
||||
onChange={(e) => {
|
||||
const value = parseFloat(e.target.value);
|
||||
if (!isNaN(value) && value >= 0.5) {
|
||||
setSmartTurnStopSecs(value);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Max silence duration before ending an incomplete turn. Default: 2 seconds
|
||||
</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Call Management Section */}
|
||||
<div className="space-y-4">
|
||||
<div>
|
||||
|
|
|
|||
|
|
@ -10,27 +10,27 @@ export interface AmbientNoiseConfiguration {
|
|||
volume: number;
|
||||
}
|
||||
|
||||
export type TurnStopStrategy = 'transcription' | 'turn_analyzer';
|
||||
|
||||
export interface WorkflowConfigurations {
|
||||
vad_configuration: VADConfiguration;
|
||||
vad_configuration?: VADConfiguration;
|
||||
ambient_noise_configuration: AmbientNoiseConfiguration;
|
||||
max_call_duration: number; // Maximum call duration in seconds
|
||||
max_user_idle_timeout: number; // Maximum user idle time in seconds
|
||||
smart_turn_stop_secs: number; // Timeout in seconds for incomplete turn detection
|
||||
turn_stop_strategy: TurnStopStrategy; // Strategy for detecting end of user turn
|
||||
dictionary?: string; // Comma-separated words for voice agent to listen for
|
||||
[key: string]: unknown; // Allow additional properties for future configurations
|
||||
}
|
||||
|
||||
export const DEFAULT_WORKFLOW_CONFIGURATIONS: WorkflowConfigurations = {
|
||||
vad_configuration: {
|
||||
confidence: 0.7,
|
||||
start_seconds: 0.4,
|
||||
stop_seconds: 0.8,
|
||||
minimum_volume: 0.6
|
||||
},
|
||||
ambient_noise_configuration: {
|
||||
enabled: false,
|
||||
volume: 0.3
|
||||
},
|
||||
max_call_duration: 600, // 10 minutes
|
||||
max_user_idle_timeout: 10, // 10 seconds
|
||||
smart_turn_stop_secs: 2, // 2 seconds
|
||||
turn_stop_strategy: 'transcription', // Default to transcription-based detection
|
||||
dictionary: ''
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue