feat: add transcript panel during live call for better visibility (#116)

* chore: remove old signaling route

* Show real time feedbacks
This commit is contained in:
Abhishek 2026-01-13 22:48:18 +05:30 committed by GitHub
parent ad4cff73c8
commit e7712474c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 599 additions and 469 deletions

View file

@ -20,7 +20,6 @@ if SENTRY_DSN and (
print(f"Sentry initialized in environment: {ENVIRONMENT}")
import asyncio
from contextlib import asynccontextmanager
from typing import Optional
@ -30,7 +29,6 @@ from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from api.routes.main import router as main_router
from api.routes.rtc_offer import pcs_map
from api.services.telephony.worker_event_subscriber import (
WorkerEventSubscriber,
setup_worker_subscriber,
@ -77,11 +75,6 @@ async def lifespan(app: FastAPI):
# Fall back to immediate stop
await worker_subscriber.stop()
# close all dangling pipecat connections
coros = [pc.close() for pc in pcs_map.values()]
await asyncio.gather(*coros)
pcs_map.clear()
await redis.aclose()

View file

@ -10,7 +10,6 @@ from api.routes.organization_usage import router as organization_usage_router
from api.routes.public_agent import router as public_agent_router
from api.routes.public_embed import router as public_embed_router
from api.routes.reports import router as reports_router
from api.routes.rtc_offer import router as rtc_offer_router
from api.routes.s3_signed_url import router as s3_router
from api.routes.service_keys import router as service_keys_router
from api.routes.superuser import router as superuser_router
@ -27,7 +26,6 @@ router = APIRouter(
)
router.include_router(telephony_router)
router.include_router(rtc_offer_router)
router.include_router(superuser_router)
router.include_router(workflow_router)
router.include_router(user_router)

View file

@ -1,77 +0,0 @@
from typing import Dict
from fastapi import APIRouter, BackgroundTasks, Depends
from loguru import logger
from pydantic import BaseModel
from api.db.models import UserModel
from api.services.auth.depends import get_user
from api.services.pipecat.run_pipeline import run_pipeline_smallwebrtc
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
from pipecat.utils.context import set_current_run_id
router = APIRouter(prefix="/pipecat")
pcs_map: Dict[str, SmallWebRTCConnection] = {}
ice_servers = ["stun:stun.l.google.com:19302"]
class RTCOfferRequest(BaseModel):
    """Request body accepted by the ``POST /rtc-offer`` endpoint."""

    # Id of a peer connection to renegotiate. The field is required but may be
    # null; an id the server does not know results in a new connection.
    pc_id: str | None
    # SDP payload and its type, handed to the connection unchanged.
    sdp: str
    type: str
    # Workflow and run that the spawned pipeline is started for.
    workflow_id: int
    workflow_run_id: int
    # Forwarded to ``renegotiate`` when an existing connection is reused.
    restart_pc: bool = False
    # Optional context variables; the endpoint substitutes ``{}`` when omitted.
    call_context_vars: dict | None = None
@router.post("/rtc-offer")
async def offer(
    request: RTCOfferRequest,
    background_tasks: BackgroundTasks,
    user: UserModel = Depends(get_user),
):
    """Answer a WebRTC offer, reusing or creating a SmallWebRTC connection.

    When ``request.pc_id`` names a connection already present in ``pcs_map``
    that connection is renegotiated. Otherwise a new connection is created,
    a ``closed`` handler that removes it from ``pcs_map`` is attached, and the
    pipeline is scheduled as a background task.

    Returns:
        The answer produced by the connection's ``get_answer()``.
    """
    pc_id = request.pc_id
    is_known_connection = bool(pc_id) and pc_id in pcs_map

    # The run id must be set *before* a SmallWebRTCConnection is constructed so
    # that tasks and event-handler coroutines spawned by the constructor
    # inherit it; otherwise the default ("NA") leaks into their log output.
    # Setting it here also covers the reuse path.
    set_current_run_id(request.workflow_run_id)

    if is_known_connection:
        pipecat_connection = pcs_map[pc_id]
        logger.info(f"Reusing existing connection for pc_id: {pc_id}")
        await pipecat_connection.renegotiate(
            sdp=request.sdp,
            type=request.type,
            restart_pc=request.restart_pc,
        )
    else:
        pipecat_connection = SmallWebRTCConnection(ice_servers)
        await pipecat_connection.initialize(sdp=request.sdp, type=request.type)

        @pipecat_connection.event_handler("closed")
        async def handle_disconnected(webrtc_connection: SmallWebRTCConnection):
            logger.info(
                f"In pipecat connection closed handler. Popping peer connection pc_id: {webrtc_connection.pc_id} from pcs_map"
            )
            pcs_map.pop(webrtc_connection.pc_id, None)

        context_vars = request.call_context_vars or {}
        background_tasks.add_task(
            run_pipeline_smallwebrtc,
            pipecat_connection,
            request.workflow_id,
            request.workflow_run_id,
            user.id,
            context_vars,
        )

    # Index the connection under the id reported in the answer so later
    # requests carrying that pc_id hit the reuse path.
    answer = pipecat_connection.get_answer()
    pcs_map[answer["pc_id"]] = pipecat_connection
    return answer

View file

@ -18,11 +18,16 @@ from aiortc import RTCIceServer
from aiortc.sdp import candidate_from_sdp
from fastapi import APIRouter, Depends, WebSocket, WebSocketDisconnect
from loguru import logger
from starlette.websockets import WebSocketState
from api.db import db_client
from api.db.models import UserModel
from api.services.auth.depends import get_user_ws
from api.services.pipecat.run_pipeline import run_pipeline_smallwebrtc
from api.services.pipecat.ws_sender_registry import (
register_ws_sender,
unregister_ws_sender,
)
from api.services.quota_service import check_dograh_quota
from pipecat.transports.smallwebrtc.connection import SmallWebRTCConnection
from pipecat.utils.context import set_current_run_id
@ -92,6 +97,9 @@ class SignalingManager:
# Cleanup
self._connections.pop(connection_id, None)
# Unregister WebSocket sender for real-time feedback
unregister_ws_sender(workflow_run_id)
# Clean up all peer connections for this workflow run
# Note: In a WebSocket-based signaling approach (vs HTTP PATCH),
# we maintain our own connection map instead of relying on
@ -182,6 +190,13 @@ class SignalingManager:
# Store peer connection using client's pc_id
self._peer_connections[pc_id] = pc
# Register WebSocket sender for real-time feedback
async def ws_sender(message: dict):
if ws.application_state == WebSocketState.CONNECTED:
await ws.send_json(message)
register_ws_sender(workflow_run_id, ws_sender)
# Setup closed handler
@pc.event_handler("closed")
async def handle_disconnected(webrtc_connection: SmallWebRTCConnection):

View file

@ -0,0 +1,227 @@
"""Real-time feedback observer for sending pipeline events to the frontend.
This observer watches pipeline frames and sends relevant events (transcriptions,
bot text) over WebSocket to provide real-time feedback in the UI.
For frames with presentation timestamps (pts), like TTSTextFrame, we respect
the timing by queuing them and sending at the appropriate time, similar to
how base_output.py handles timed frames.
"""
import asyncio
import time
from typing import Awaitable, Callable, Optional, Set
from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
FunctionCallInProgressFrame,
FunctionCallResultFrame,
InterimTranscriptionFrame,
InterruptionFrame,
StopFrame,
TranscriptionFrame,
TTSTextFrame,
)
from pipecat.observers.base_observer import BaseObserver, FramePushed
from pipecat.processors.frame_processor import FrameDirection
from pipecat.utils.time import nanoseconds_to_seconds
class RealtimeFeedbackObserver(BaseObserver):
    """Observer that sends real-time transcription and bot response events via WebSocket.

    User transcriptions and function-call events are sent as soon as they are
    observed. Bot TTS text that carries a presentation timestamp (pts) is
    queued and released by a background task when its pts comes due, so the
    text tracks audio playback. Interruptions and pipeline termination
    discard anything still queued.
    """

    # Function-call frames are only reported when travelling downstream.
    _FUNCTION_CALL_FRAME_TYPES = (
        FunctionCallInProgressFrame,
        FunctionCallResultFrame,
    )
    # Every frame type that produces a message for the client. Frames of any
    # other type are ignored before de-duplication so their ids are never
    # stored.
    _FEEDBACK_FRAME_TYPES = (
        InterimTranscriptionFrame,
        TranscriptionFrame,
        TTSTextFrame,
    ) + _FUNCTION_CALL_FRAME_TYPES

    def __init__(
        self,
        ws_sender: Callable[[dict], Awaitable[None]],
    ):
        """Create the observer.

        Args:
            ws_sender: Async function to send messages over WebSocket.
                Expected signature: async def send(message: dict) -> None
        """
        super().__init__()
        self._ws_sender = ws_sender
        # Ids of feedback frames already forwarded (frames can be observed
        # multiple times).
        self._frames_seen: Set[str] = set()

        # Clock/timing for pts-based frames (similar to base_output.py)
        self._clock_queue: Optional[asyncio.PriorityQueue] = None
        self._clock_task: Optional[asyncio.Task] = None
        # Monotonic clock reading taken when the first pts frame was seen.
        self._clock_start_time: Optional[float] = None
        # First pts value we saw; later pts values are measured against it.
        self._pts_start_time: Optional[int] = None

    async def _ensure_clock_task(self):
        """Create the queue and the clock task if they don't exist."""
        if self._clock_queue is None:
            self._clock_queue = asyncio.PriorityQueue()
            self._clock_task = asyncio.create_task(self._clock_task_handler())

    async def _cancel_clock_task(self):
        """Cancel the clock task and clear the queue.

        Called on interruption and on pipeline termination to discard any
        pending bot text that hasn't been sent yet.
        """
        if self._clock_task:
            self._clock_task.cancel()
            try:
                await self._clock_task
            except asyncio.CancelledError:
                pass
            self._clock_task = None
            self._clock_queue = None
        # Reset timing references so next bot response starts fresh
        self._clock_start_time = None
        self._pts_start_time = None

    async def _handle_interruption(self):
        """Handle interruption by clearing queued bot text.

        The clock task is cancelled here and recreated lazily by the next
        pts-carrying frame.
        """
        await self._cancel_clock_task()

    async def _clock_task_handler(self):
        """Process timed frames from the queue, respecting their presentation timestamps.

        Each queued message is held until its pts offset (relative to the
        first pts seen) has elapsed since the moment that first frame was
        observed, then sent.
        """
        # The queue is assigned before this task is created and is only
        # cleared after the task has finished, so one lookup is enough.
        queue = self._clock_queue
        while True:
            try:
                pts, _frame_id, message = await queue.get()

                if (
                    self._clock_start_time is not None
                    and self._pts_start_time is not None
                ):
                    # Target time = start time + (frame pts - start pts) in seconds.
                    # A monotonic clock keeps the delay correct even if the
                    # system wall clock is adjusted during the call.
                    target_time = self._clock_start_time + nanoseconds_to_seconds(
                        pts - self._pts_start_time
                    )
                    delay = target_time - time.monotonic()
                    if delay > 0:
                        await asyncio.sleep(delay)

                await self._send_message(message)
                queue.task_done()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.debug(f"Clock task error: {e}")

    @staticmethod
    def _transcription_message(frame, final: bool) -> dict:
        """Build the ``rtf-user-transcription`` message for a transcription frame."""
        return {
            "type": "rtf-user-transcription",
            "payload": {
                "text": frame.text,
                "final": final,
                "user_id": frame.user_id,
                "timestamp": frame.timestamp,
            },
        }

    async def on_push_frame(self, data: FramePushed):
        """Process frames and send relevant ones to the client."""
        frame = data.frame

        # Handle pipeline termination - stop clock task
        if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
            await self._cancel_clock_task()
            return

        # Handle interruptions - clear any queued bot text
        if isinstance(frame, InterruptionFrame):
            await self._handle_interruption()
            return

        # Ignore everything we never forward *before* recording the frame id,
        # so the seen-set only grows with feedback frames rather than with
        # every frame that flows through the pipeline.
        if not isinstance(frame, self._FEEDBACK_FRAME_TYPES):
            return

        # Function-call frames are only reported in the downstream direction.
        # Filtering here (not after de-duplication) ensures an observation in
        # the other direction cannot suppress the downstream one.
        if (
            isinstance(frame, self._FUNCTION_CALL_FRAME_TYPES)
            and data.direction != FrameDirection.DOWNSTREAM
        ):
            return

        # Skip already processed frames (frames can be observed multiple times)
        if frame.id in self._frames_seen:
            return
        self._frames_seen.add(frame.id)

        if isinstance(frame, InterimTranscriptionFrame):
            await self._send_message(self._transcription_message(frame, final=False))
        elif isinstance(frame, TranscriptionFrame):
            await self._send_message(self._transcription_message(frame, final=True))
        elif isinstance(frame, TTSTextFrame):
            message = {
                "type": "rtf-bot-text",
                "payload": {
                    "text": frame.text,
                },
            }
            if frame.pts:
                # Initialize timing reference on first pts frame
                if self._pts_start_time is None:
                    self._pts_start_time = frame.pts
                    self._clock_start_time = time.monotonic()
                await self._ensure_clock_task()
                # frame.id breaks ties between equal pts values so the queue
                # never has to compare the message dicts.
                await self._clock_queue.put((frame.pts, frame.id, message))
            else:
                # No pts, send immediately
                await self._send_message(message)
        elif isinstance(frame, FunctionCallInProgressFrame):
            await self._send_message(
                {
                    "type": "rtf-function-call-start",
                    "payload": {
                        "function_name": frame.function_name,
                        "tool_call_id": frame.tool_call_id,
                    },
                }
            )
        elif isinstance(frame, FunctionCallResultFrame):
            # Only a missing result maps to None; falsy results such as 0,
            # False or an empty string are still reported as text.
            result = str(frame.result) if frame.result is not None else None
            await self._send_message(
                {
                    "type": "rtf-function-call-end",
                    "payload": {
                        "function_name": frame.function_name,
                        "tool_call_id": frame.tool_call_id,
                        "result": result,
                    },
                }
            )

    async def _send_message(self, message: dict):
        """Send message via WebSocket, handling errors gracefully."""
        try:
            await self._ws_sender(message)
        except Exception as e:
            # Log but don't fail - feedback is non-critical
            logger.debug(f"Failed to send real-time feedback message: {e}")

View file

@ -23,6 +23,7 @@ from api.services.pipecat.pipeline_engine_callbacks_processor import (
PipelineEngineCallbacksProcessor,
)
from api.services.pipecat.pipeline_metrics_aggregator import PipelineMetricsAggregator
from api.services.pipecat.realtime_feedback_observer import RealtimeFeedbackObserver
from api.services.pipecat.service_factory import (
create_llm_service,
create_stt_service,
@ -38,6 +39,7 @@ from api.services.pipecat.transport_setup import (
create_vonage_transport,
create_webrtc_transport,
)
from api.services.pipecat.ws_sender_registry import get_ws_sender
from api.services.telephony.stasis_rtp_connection import StasisRTPConnection
from api.services.workflow.dto import ReactFlowDTO
from api.services.workflow.pipecat_engine import PipecatEngine
@ -564,6 +566,12 @@ async def _run_pipeline(
# Create pipeline task with audio configuration
task = create_pipeline_task(pipeline, workflow_run_id, audio_config)
# Add real-time feedback observer if WebSocket sender is available
ws_sender = get_ws_sender(workflow_run_id)
if ws_sender:
feedback_observer = RealtimeFeedbackObserver(ws_sender=ws_sender)
task.add_observer(feedback_observer)
# Now set the task on the engine
engine.set_task(task)

View file

@ -0,0 +1,28 @@
"""Registry to store WebSocket senders by workflow_run_id.
This allows the pipeline observer to send messages back through
the signaling WebSocket without passing the WebSocket directly.
"""
from typing import Awaitable, Callable, Dict, Optional
# Maps workflow_run_id -> async callable that delivers a message dict to the
# signaling WebSocket serving that run.
_ws_senders: Dict[int, Callable[[dict], Awaitable[None]]] = {}


def register_ws_sender(
    workflow_run_id: int, sender: Callable[[dict], Awaitable[None]]
) -> None:
    """Register a WebSocket sender for a workflow run.

    Any sender previously registered for the same run is replaced.
    """
    _ws_senders[workflow_run_id] = sender


def unregister_ws_sender(
    workflow_run_id: int,
    sender: Optional[Callable[[dict], Awaitable[None]]] = None,
) -> None:
    """Unregister a WebSocket sender for a workflow run.

    Args:
        workflow_run_id: Run whose sender should be removed.
        sender: Optional guard. When given, the entry is removed only if it
            is this exact sender, so cleanup of an old connection cannot
            drop a sender registered afterwards for the same run. When
            omitted, whatever is registered is removed.

    Unknown run ids are ignored.
    """
    if sender is not None and _ws_senders.get(workflow_run_id) is not sender:
        return
    _ws_senders.pop(workflow_run_id, None)


def get_ws_sender(
    workflow_run_id: int,
) -> Optional[Callable[[dict], Awaitable[None]]]:
    """Get the WebSocket sender for a workflow run, or None if not registered."""
    return _ws_senders.get(workflow_run_id)

View file

@ -9,6 +9,7 @@ import {
ApiKeyErrorDialog,
AudioControls,
ConnectionStatus,
RealtimeFeedbackPanel,
WorkflowConfigErrorDialog
} from "./components";
import { useWebSocketRTC } from "./hooks";
@ -40,7 +41,8 @@ const BrowserCall = ({ workflowId, workflowRunId, accessToken, initialContextVar
start,
stop,
isStarting,
getAudioInputDevices
getAudioInputDevices,
feedbackMessages,
} = useWebSocketRTC({ workflowId, workflowRunId, accessToken, initialContextVariables });
// Poll for recording availability after call ends
@ -93,44 +95,61 @@ const BrowserCall = ({ workflowId, workflowRunId, accessToken, initialContextVar
return (
<>
<Card className="w-full max-w-4xl mx-auto">
<CardHeader>
<CardTitle>Call Voice Agent</CardTitle>
</CardHeader>
<div className="flex h-full w-full">
{/* Main content - 2/3 width when panel visible, full width otherwise */}
<div className="w-2/3 h-full">
<div className="flex justify-center items-center h-full px-8">
<Card className="w-full max-w-xl">
<CardHeader>
<CardTitle>Call Voice Agent</CardTitle>
</CardHeader>
<CardContent>
{isCompleted && checkingForRecording ? (
<div className="flex flex-col items-center justify-center space-y-4 p-8">
<Loader2 className="h-8 w-8 animate-spin text-primary" />
<div className="text-center space-y-2">
<p className="text-foreground font-medium">Processing your call</p>
<p className="text-sm text-muted-foreground">Fetching transcript and recording...</p>
</div>
</div>
) : (
<>
<AudioControls
audioInputs={audioInputs}
selectedAudioInput={selectedAudioInput}
setSelectedAudioInput={setSelectedAudioInput}
isCompleted={isCompleted}
connectionActive={connectionActive}
permissionError={permissionError}
start={start}
stop={stop}
isStarting={isStarting}
getAudioInputDevices={getAudioInputDevices}
/>
<CardContent>
{isCompleted && checkingForRecording ? (
<div className="flex flex-col items-center justify-center space-y-4 p-8">
<Loader2 className="h-8 w-8 animate-spin text-primary" />
<div className="text-center space-y-2">
<p className="text-foreground font-medium">Processing your call</p>
<p className="text-sm text-muted-foreground">Fetching transcript and recording...</p>
</div>
</div>
) : (
<>
<AudioControls
audioInputs={audioInputs}
selectedAudioInput={selectedAudioInput}
setSelectedAudioInput={setSelectedAudioInput}
isCompleted={isCompleted}
connectionActive={connectionActive}
permissionError={permissionError}
start={start}
stop={stop}
isStarting={isStarting}
getAudioInputDevices={getAudioInputDevices}
/>
<ConnectionStatus
connectionStatus={connectionStatus}
/>
</>
)}
</CardContent>
<ConnectionStatus
connectionStatus={connectionStatus}
/>
</>
)}
</CardContent>
<audio ref={audioRef} autoPlay playsInline className="hidden" />
</Card>
<audio ref={audioRef} autoPlay playsInline className="hidden" />
</Card>
</div>
</div>
{/* Show transcript panel */}
<div className="w-1/3 h-full shrink-0">
<RealtimeFeedbackPanel
messages={feedbackMessages}
isVisible={true}
isCallActive={connectionActive}
isCallCompleted={isCompleted}
/>
</div>
</div>
<ApiKeyErrorDialog
open={apiKeyModalOpen}

View file

@ -0,0 +1,152 @@
"use client";
import { Loader2, MessageSquare, Mic, MicOff, Wrench } from "lucide-react";
import { useEffect, useRef } from "react";
import { cn } from "@/lib/utils";
import { FeedbackMessage } from "../hooks/useWebSocketRTC";
/** Props accepted by RealtimeFeedbackPanel. */
interface RealtimeFeedbackPanelProps {
// Feedback messages to render, shown in array order.
messages: FeedbackMessage[];
// When false the panel renders nothing.
isVisible: boolean;
// Selects the "Live" status badge and the empty-state hint text.
isCallActive: boolean;
// Selects the "Ended" badge when the call is not active.
isCallCompleted: boolean;
}
/**
 * Renders one feedback message.
 *
 * Function calls are shown as a centered pill (spinner while running, wrench
 * otherwise). User transcriptions are right-aligned bubbles and all other
 * messages left-aligned bubbles; non-final messages are dimmed and tagged
 * "speaking...".
 */
const MessageItem = ({ msg }: { msg: FeedbackMessage }) => {
    // Function call message - centered
    if (msg.type === 'function-call') {
        const isRunning = msg.status === 'running';
        return (
            <div className="flex justify-center">
                <div className="px-3 py-1.5 rounded-full text-xs bg-amber-500/10 border border-amber-500/20 inline-flex items-center gap-2">
                    {isRunning ? (
                        <Loader2 className="h-3 w-3 animate-spin text-amber-500" />
                    ) : (
                        <Wrench className="h-3 w-3 text-amber-500" />
                    )}
                    <span className="font-mono text-amber-700 dark:text-amber-400">
                        {msg.functionName}()
                    </span>
                    {msg.status === 'completed' && (
                        // The original span was empty and rendered nothing;
                        // show an explicit completion mark instead.
                        <span className="text-muted-foreground" aria-label="completed">✓</span>
                    )}
                </div>
            </div>
        );
    }

    const isUser = msg.type === 'user-transcription';
    const isInterim = !msg.final;

    // User messages on right, bot messages on left
    return (
        <div className={cn("flex", isUser ? "justify-end" : "justify-start")}>
            <div
                className={cn(
                    "max-w-[85%] px-3 py-2 rounded-2xl text-sm",
                    isUser
                        ? "bg-primary text-primary-foreground rounded-br-md"
                        : "bg-muted rounded-bl-md",
                    isInterim && "opacity-70"
                )}
            >
                <div className="whitespace-pre-wrap leading-relaxed">{msg.text}</div>
                {isInterim && (
                    <div className={cn(
                        "text-[10px] mt-1 italic",
                        isUser ? "text-primary-foreground/70" : "text-muted-foreground"
                    )}>
                        speaking...
                    </div>
                )}
            </div>
        </div>
    );
};
/**
 * Side panel showing the live transcript of a call.
 *
 * Renders a header with a status badge (Live / Ended / Ready), the scrolling
 * message list (or an empty-state hint), and a footer with the number of
 * non-function-call messages. Returns null when `isVisible` is false.
 */
export const RealtimeFeedbackPanel = ({
    messages,
    isVisible,
    isCallActive,
    isCallCompleted
}: RealtimeFeedbackPanelProps) => {
    const scrollRef = useRef<HTMLDivElement>(null);

    // Auto-scroll to bottom when new messages arrive.
    // Hooks must run on every render, so this stays above the early return.
    useEffect(() => {
        const container = scrollRef.current;
        if (container) {
            container.scrollTop = container.scrollHeight;
        }
    }, [messages]);

    if (!isVisible) return null;

    // Active takes precedence over completed; "Ready" is the pre-call state.
    const statusLabel = isCallActive ? 'Live' : isCallCompleted ? 'Ended' : 'Ready';
    const StatusIcon = isCallActive ? Mic : MicOff;

    // Function calls are shown in the list but not counted as messages.
    const transcriptCount = messages.filter(m => m.type !== 'function-call').length;

    return (
        <div className="w-full h-full flex flex-col bg-background border-l border-border">
            {/* Header */}
            <div className="px-4 py-3 border-b border-border shrink-0">
                <div className="flex items-center justify-center gap-2">
                    <MessageSquare className="h-4 w-4 text-muted-foreground shrink-0" />
                    <span className="font-medium text-sm whitespace-nowrap">Live Transcript</span>
                    <div className={cn(
                        "flex items-center gap-1 text-xs px-2 py-0.5 rounded-full shrink-0",
                        // Both non-active states share the muted styling.
                        isCallActive
                            ? "bg-green-500/10 text-green-600 dark:text-green-400"
                            : "bg-muted text-muted-foreground"
                    )}>
                        <StatusIcon className="h-3 w-3" />
                        <span>{statusLabel}</span>
                    </div>
                </div>
            </div>

            {/* Messages */}
            <div ref={scrollRef} className="flex-1 overflow-y-auto">
                {messages.length === 0 ? (
                    <div className="flex flex-col items-center justify-center h-full text-muted-foreground text-sm">
                        <MessageSquare className="h-10 w-10 mb-4 opacity-30" />
                        <p className="font-medium">No messages yet</p>
                        <p className="text-xs mt-1 text-center px-4">
                            {isCallActive
                                ? "Start speaking to see the transcript"
                                : "Start the call to begin the conversation"
                            }
                        </p>
                    </div>
                ) : (
                    <div className="space-y-3 p-4">
                        {messages.map((msg) => (
                            <MessageItem key={msg.id} msg={msg} />
                        ))}
                    </div>
                )}
            </div>

            {/* Footer with message count */}
            {messages.length > 0 && (
                <div className="px-4 py-2 border-t border-border text-xs text-muted-foreground shrink-0">
                    {transcriptCount} {transcriptCount === 1 ? 'message' : 'messages'}
                </div>
            )}
        </div>
    );
};

View file

@ -2,4 +2,5 @@ export * from './ApiKeyErrorDialog';
export * from './AudioControls';
export * from './ConnectionStatus';
export * from './ContextDisplay';
export * from './RealtimeFeedbackPanel';
export * from './WorkflowConfigErrorDialog'

View file

@ -1,3 +1,2 @@
export * from './useDeviceInputs';
export * from './useWebRTC';
export * from './useWebSocketRTC';

View file

@ -1,287 +0,0 @@
import { useRef, useState } from "react";
import { offerApiV1PipecatRtcOfferPost, validateUserConfigurationsApiV1UserConfigurationsUserValidateGet, validateWorkflowApiV1WorkflowWorkflowIdValidatePost } from "@/client/sdk.gen";
import { WorkflowValidationError } from "@/components/flow/types";
import logger from '@/lib/logger';
import { sdpFilterCodec } from "../utils";
import { useDeviceInputs } from "./useDeviceInputs";
/** Arguments accepted by the useWebRTC hook. */
interface UseWebRTCProps {
// Workflow to validate and to name in the RTC offer.
workflowId: number;
// Workflow run sent as workflow_run_id in the RTC offer.
workflowRunId: number;
// Bearer token for API calls; start() does nothing while it is null.
accessToken: string | null;
// Sent as call_context_vars in the offer; treated as {} when null or omitted.
initialContextVariables?: Record<string, string> | null;
}
export const useWebRTC = ({ workflowId, workflowRunId, accessToken, initialContextVariables }: UseWebRTCProps) => {
const [connectionStatus, setConnectionStatus] = useState<'idle' | 'connecting' | 'connected' | 'failed'>('idle');
const [connectionActive, setConnectionActive] = useState(false);
const [isCompleted, setIsCompleted] = useState(false);
const [apiKeyModalOpen, setApiKeyModalOpen] = useState(false);
const [apiKeyError, setApiKeyError] = useState<string | null>(null);
const [workflowConfigModalOpen, setWorkflowConfigModalOpen] = useState(false);
const [workflowConfigError, setWorkflowConfigError] = useState<string | null>(null);
const [isStarting, setIsStarting] = useState(false);
// Use initial context variables directly, no UI for editing
const initialContext = initialContextVariables || {};
const {
audioInputs,
selectedAudioInput,
setSelectedAudioInput,
permissionError,
setPermissionError
} = useDeviceInputs();
const useStun = true;
const useAudio = true;
const audioCodec = 'default';
const audioRef = useRef<HTMLAudioElement>(null);
const pcRef = useRef<RTCPeerConnection | null>(null);
const timeStartRef = useRef<number | null>(null);
// Build a peer-connection id: the prefix "PC-" followed by 16 random bytes
// from the Web Crypto API, each rendered as two lowercase hex digits.
const generateSecureId = () => {
    const randomBytes = crypto.getRandomValues(new Uint8Array(16));
    let hex = '';
    randomBytes.forEach((byte) => {
        hex += byte.toString(16).padStart(2, '0');
    });
    return 'PC-' + hex;
};
const pc_id = generateSecureId();
// Create the RTCPeerConnection, wire its state and track listeners, store it
// in pcRef and return it.
const createPeerConnection = () => {
    const config: RTCConfiguration = { iceServers: [] };
    if (useStun) {
        config.iceServers = [{ urls: ['stun:stun.l.google.com:19302'] }];
    }
    const pc = new RTCPeerConnection(config);

    pc.addEventListener('icegatheringstatechange', () => {
        logger.info(`ICE gathering state changed in createPeerConnection, ${pc.iceGatheringState}`);
    });

    // Mirror the ICE connection state into the hook's connectionStatus.
    pc.addEventListener('iceconnectionstatechange', () => {
        logger.info(`ICE connection state changed: ${pc.iceConnectionState}`);
        switch (pc.iceConnectionState) {
            case 'connected':
            case 'completed':
                setConnectionStatus('connected');
                break;
            case 'failed':
            case 'disconnected':
                setConnectionStatus('failed');
                break;
            default:
                break;
        }
    });

    // Route the first stream of an incoming audio track to the <audio> element.
    pc.addEventListener('track', (evt) => {
        if (evt.track.kind !== 'audio' || !audioRef.current) return;
        audioRef.current.srcObject = evt.streams[0];
    });

    pcRef.current = pc;
    return pc;
};
// Create an SDP offer, wait for ICE gathering to finish, post the offer to
// the backend and apply the returned answer. Any failure marks the
// connection as 'failed' so the UI does not stay in 'connecting'.
const negotiate = async () => {
    const pc = pcRef.current;
    // Without a peer connection or a token there is nothing to negotiate;
    // bail out before touching the peer connection.
    if (!pc || !accessToken) return;

    try {
        const offer = await pc.createOffer();
        await pc.setLocalDescription(offer);

        // The offer is sent only after all ICE candidates have been gathered.
        await new Promise<void>((resolve) => {
            if (pc.iceGatheringState === 'complete') {
                resolve();
                return;
            }
            const checkState = () => {
                if (pc.iceGatheringState === 'complete') {
                    logger.debug(`ICE gathering is complete in negotiate, ${pc.iceGatheringState}`);
                    pc.removeEventListener('icegatheringstatechange', checkState);
                    resolve();
                }
            };
            pc.addEventListener('icegatheringstatechange', checkState);
        });

        const localDescription = pc.localDescription;
        if (!localDescription) {
            logger.error('Negotiation failed: no local description available');
            setConnectionStatus('failed');
            return;
        }

        let sdp = localDescription.sdp;
        if (audioCodec !== 'default') {
            sdp = sdpFilterCodec('audio', audioCodec, sdp);
        }

        const response = await offerApiV1PipecatRtcOfferPost({
            headers: {
                'Authorization': `Bearer ${accessToken}`,
            },
            body: {
                sdp: sdp,
                type: 'offer',
                pc_id: pc_id,
                restart_pc: false,
                workflow_id: workflowId,
                workflow_run_id: workflowRunId,
                call_context_vars: initialContext
            }
        });

        if (!response || !response.data) {
            // No answer came back; previously this was silently ignored and
            // the status stayed at 'connecting'.
            logger.error('Negotiation failed: RTC offer returned no answer');
            setConnectionStatus('failed');
            return;
        }

        const answerSdpText = typeof response.data === 'object' && 'sdp' in response.data
            ? response.data.sdp as string
            : '';

        await pc.setRemoteDescription({
            type: 'answer',
            sdp: answerSdpText
        });

        setConnectionActive(true);
    } catch (e) {
        logger.error(`Negotiation failed: ${e}`);
        setConnectionStatus('failed');
    }
};
// Start a call: validate the user's configuration and the workflow, then
// create the peer connection, acquire the microphone and negotiate.
// Does nothing while a start is already in progress or without a token.
const start = async () => {
if (isStarting || !accessToken) return;
setIsStarting(true);
setConnectionStatus('connecting');
try {
// First check the user's configuration (API keys).
const response = await validateUserConfigurationsApiV1UserConfigurationsUserValidateGet({
headers: {
'Authorization': `Bearer ${accessToken}`,
},
query: {
validity_ttl_seconds: 86400
},
});
if (response.error) {
// Show the API key dialog with one "model: message" line per error.
setApiKeyModalOpen(true);
let msg = 'API Key Error';
// NOTE(review): the cast describes an object with an `errors` array, but
// the check below only accepts `detail` being an array itself — confirm
// the actual response shape.
const detail = (response.error as unknown as { detail?: { errors: { model: string; message: string }[] } }).detail;
if (Array.isArray(detail)) {
msg = detail
.map((e: { model: string; message: string }) => `${e.model}: ${e.message}`)
.join('\n');
}
setApiKeyError(msg);
// NOTE(review): connectionStatus was set to 'connecting' above and is not
// reset on this early return.
return;
}
// Then check workflow validation
const workflowResponse = await validateWorkflowApiV1WorkflowWorkflowIdValidatePost({
path: {
workflow_id: workflowId,
},
headers: {
'Authorization': `Bearer ${accessToken}`,
},
});
if (workflowResponse.error) {
// Show the workflow dialog with one "kind: message" line per error.
setWorkflowConfigModalOpen(true);
let msg = 'Workflow validation failed';
const errorDetail = workflowResponse.error as { detail?: { errors: WorkflowValidationError[] } };
if (errorDetail?.detail?.errors) {
msg = errorDetail.detail.errors
.map(err => `${err.kind}: ${err.message}`)
.join('\n');
}
setWorkflowConfigError(msg);
return;
}
timeStartRef.current = null;
const pc = createPeerConnection();
// Build the getUserMedia constraints; audio stays false unless useAudio is set.
const constraints: MediaStreamConstraints = {
audio: false,
};
if (useAudio) {
const audioConstraints: MediaTrackConstraints = {};
if (selectedAudioInput) {
audioConstraints.deviceId = { exact: selectedAudioInput };
}
// With no specific device selected, request any audio input.
constraints.audio = Object.keys(audioConstraints).length ? audioConstraints : true;
}
if (constraints.audio) {
try {
const stream = await navigator.mediaDevices.getUserMedia(constraints);
// Attach every captured track to the peer connection before negotiating.
stream.getTracks().forEach((track) => {
pc.addTrack(track, stream);
});
await negotiate();
} catch (err) {
logger.error(`Could not acquire media: ${err}`);
setPermissionError('Could not acquire media');
setConnectionStatus('failed');
}
} else {
await negotiate();
}
} finally {
// Always clear the in-progress flag, including on the early returns above.
setIsStarting(false);
}
};
// End the call: reset the connection state flags, stop transceivers and
// outgoing tracks, and close the peer connection after a 500 ms delay.
const stop = () => {
    setConnectionActive(false);
    setIsCompleted(true);
    setConnectionStatus('idle');

    const pc = pcRef.current;
    if (!pc) return;

    // getTransceivers / transceiver.stop may be missing, so both are
    // feature-checked before use.
    const transceivers = pc.getTransceivers ? pc.getTransceivers() : [];
    for (let i = 0; i < transceivers.length; i += 1) {
        const transceiver = transceivers[i];
        if (transceiver.stop) {
            transceiver.stop();
        }
    }

    const senders = pc.getSenders();
    for (let i = 0; i < senders.length; i += 1) {
        const track = senders[i].track;
        if (track) {
            track.stop();
        }
    }

    // Close whatever connection is current when the timer fires.
    setTimeout(() => {
        const current = pcRef.current;
        if (current) {
            current.close();
            pcRef.current = null;
        }
    }, 500);
};
return {
audioRef,
audioInputs,
selectedAudioInput,
setSelectedAudioInput,
connectionActive,
permissionError,
isCompleted,
apiKeyModalOpen,
setApiKeyModalOpen,
apiKeyError,
workflowConfigError,
workflowConfigModalOpen,
setWorkflowConfigModalOpen,
connectionStatus,
start,
stop,
isStarting,
initialContext
};
};

View file

@ -15,6 +15,16 @@ interface UseWebSocketRTCProps {
initialContextVariables?: Record<string, string> | null;
}
/** One entry in the live transcript built from real-time feedback events. */
export interface FeedbackMessage {
// Unique key for the entry; RealtimeFeedbackPanel uses it as the React key.
id: string;
// Which kind of entry this is; determines how the panel renders it.
type: 'user-transcription' | 'bot-text' | 'function-call';
// Text content of the entry.
text: string;
// False while the entry may still change; the panel dims non-final entries.
final?: boolean;
timestamp: string;
// Displayed as `functionName()` for function-call entries.
functionName?: string;
// Function-call progress: spinner while 'running', mark when 'completed'.
status?: 'running' | 'completed';
}
export const useWebSocketRTC = ({ workflowId, workflowRunId, accessToken, initialContextVariables }: UseWebSocketRTCProps) => {
const [connectionStatus, setConnectionStatus] = useState<'idle' | 'connecting' | 'connected' | 'failed'>('idle');
const [connectionActive, setConnectionActive] = useState(false);
@ -24,6 +34,7 @@ export const useWebSocketRTC = ({ workflowId, workflowRunId, accessToken, initia
const [workflowConfigModalOpen, setWorkflowConfigModalOpen] = useState(false);
const [workflowConfigError, setWorkflowConfigError] = useState<string | null>(null);
const [isStarting, setIsStarting] = useState(false);
const [feedbackMessages, setFeedbackMessages] = useState<FeedbackMessage[]>([]);
const initialContext = initialContextVariables || {};
const {
@ -271,6 +282,105 @@ export const useWebSocketRTC = ({ workflowId, workflowRunId, accessToken, initia
}
break;
case 'rtf-user-transcription': {
const transcription = message.payload;
setFeedbackMessages(prev => {
// Mark last bot message as final (user started speaking)
const withBotFinalized = prev.map((m, i) =>
i === prev.length - 1 && m.type === 'bot-text' && !m.final
? { ...m, final: true }
: m
);
// For interim transcriptions, replace the last interim
if (!transcription.final) {
const withoutLastInterim = withBotFinalized.filter(
m => !(m.type === 'user-transcription' && !m.final)
);
return [...withoutLastInterim, {
id: `user-${Date.now()}`,
type: 'user-transcription',
text: transcription.text,
final: false,
timestamp: new Date().toISOString(),
}];
}
// For final transcriptions, replace interim with final
const withoutInterim = withBotFinalized.filter(
m => !(m.type === 'user-transcription' && !m.final)
);
return [...withoutInterim, {
id: `user-${Date.now()}`,
type: 'user-transcription',
text: transcription.text,
final: true,
timestamp: new Date().toISOString(),
}];
});
break;
}
case 'rtf-bot-text': {
// TTS text comes as sentences/phrases, concatenate with space
setFeedbackMessages(prev => {
const last = prev[prev.length - 1];
if (last && last.type === 'bot-text' && !last.final) {
// Append to existing bot message with space if needed
const existingText = last.text;
const newText = message.payload.text;
// Add space between chunks if previous doesn't end with space
// and new doesn't start with space or punctuation
const needsSpace = existingText.length > 0 &&
!existingText.endsWith(' ') &&
!newText.startsWith(' ') &&
!/^[.,!?;:]/.test(newText);
return [
...prev.slice(0, -1),
{ ...last, text: existingText + (needsSpace ? ' ' : '') + newText }
];
}
// Start new bot message
return [...prev, {
id: `bot-${Date.now()}`,
type: 'bot-text',
text: message.payload.text,
final: false,
timestamp: new Date().toISOString(),
}];
});
break;
}
case 'rtf-function-call-start': {
const { function_name, tool_call_id } = message.payload;
setFeedbackMessages(prev => {
// Check if we already have this function call
const existingId = `func-${tool_call_id}`;
if (prev.some(msg => msg.id === existingId)) {
return prev;
}
return [...prev, {
id: existingId,
type: 'function-call',
text: function_name,
functionName: function_name,
status: 'running',
timestamp: new Date().toISOString(),
}];
});
break;
}
case 'rtf-function-call-end': {
const { tool_call_id, result } = message.payload;
setFeedbackMessages(prev => prev.map(msg =>
msg.id === `func-${tool_call_id}`
? { ...msg, status: 'completed' as const, text: result || msg.text }
: msg
));
break;
}
default:
logger.warn('Unknown message type:', message.type);
}
@ -505,6 +615,7 @@ export const useWebSocketRTC = ({ workflowId, workflowRunId, accessToken, initia
stop,
isStarting,
initialContext,
getAudioInputDevices
getAudioInputDevices,
feedbackMessages,
};
};

File diff suppressed because one or more lines are too long

View file

@ -483,18 +483,6 @@ export type PresignedUploadUrlResponse = {
expires_in: number;
};
export type RtcOfferRequest = {
pc_id: string | null;
sdp: string;
type: string;
workflow_id: number;
workflow_run_id: number;
restart_pc?: boolean;
call_context_vars?: {
[key: string]: unknown;
} | null;
};
export type S3SignedUrlResponse = {
url: string;
expires_in: number;
@ -1260,37 +1248,6 @@ export type HandleInboundFallbackApiV1TelephonyInboundFallbackPostResponses = {
200: unknown;
};
export type OfferApiV1PipecatRtcOfferPostData = {
body: RtcOfferRequest;
headers?: {
authorization?: string | null;
'X-API-Key'?: string | null;
};
path?: never;
query?: never;
url: '/api/v1/pipecat/rtc-offer';
};
export type OfferApiV1PipecatRtcOfferPostErrors = {
/**
* Not found
*/
404: unknown;
/**
* Validation Error
*/
422: HttpValidationError;
};
export type OfferApiV1PipecatRtcOfferPostError = OfferApiV1PipecatRtcOfferPostErrors[keyof OfferApiV1PipecatRtcOfferPostErrors];
export type OfferApiV1PipecatRtcOfferPostResponses = {
/**
* Successful Response
*/
200: unknown;
};
export type ImpersonateApiV1SuperuserImpersonatePostData = {
body: ImpersonateRequest;
headers?: {