feat: add gemini realtime and speaches integration

- Add gemini realtime support
- Add speaches support for locally hosted LLMs
This commit is contained in:
Abhishek Kumar 2026-03-31 17:39:47 +05:30
parent 2eaaabd936
commit ee2028eb2d
19 changed files with 531 additions and 185 deletions

View file

@ -67,6 +67,10 @@ def setup_logging():
# Handler might already be removed
pass
# Set default extra values on the shared core so ALL logger references
# (including ones imported before this runs) have run_id available.
loguru.logger.configure(extra={"run_id": None})
# Patch loguru to inject run_id
patched = loguru.logger.patch(inject_run_id)

View file

@ -33,6 +33,7 @@ class DefaultConfigurationsResponse(TypedDict):
tts: dict[str, dict]
stt: dict[str, dict]
embeddings: dict[str, dict]
realtime: dict[str, dict]
default_providers: dict[str, str]
@ -55,6 +56,10 @@ async def get_default_configurations() -> DefaultConfigurationsResponse:
provider: model_cls.model_json_schema()
for provider, model_cls in REGISTRY[ServiceType.EMBEDDINGS].items()
},
"realtime": {
provider: model_cls.model_json_schema()
for provider, model_cls in REGISTRY[ServiceType.REALTIME].items()
},
"default_providers": DEFAULT_SERVICE_PROVIDERS,
}
return configurations
@ -75,6 +80,8 @@ class UserConfigurationRequestResponseSchema(BaseModel):
tts: dict[str, Union[str, float, list[str], None]] | None = None
stt: dict[str, Union[str, float, list[str], None]] | None = None
embeddings: dict[str, Union[str, float, list[str], None]] | None = None
realtime: dict[str, Union[str, float, list[str], None]] | None = None
is_realtime: bool | None = None
test_phone_number: str | None = None
timezone: str | None = None
organization_pricing: dict[str, Union[float, str, bool]] | None = None

View file

@ -5,6 +5,7 @@ from pydantic import BaseModel
from api.services.configuration.registry import (
EmbeddingsConfig,
LLMConfig,
RealtimeConfig,
STTConfig,
TTSConfig,
)
@ -15,6 +16,8 @@ class UserConfiguration(BaseModel):
stt: STTConfig | None = None
tts: TTSConfig | None = None
embeddings: EmbeddingsConfig | None = None
realtime: RealtimeConfig | None = None
is_realtime: bool = False
test_phone_number: str | None = None
timezone: str | None = None
last_validated_at: datetime | None = None

View file

@ -47,6 +47,8 @@ class UserConfigurationValidator:
ServiceProviders.CAMB.value: self._check_camb_api_key,
ServiceProviders.AWS_BEDROCK.value: self._check_aws_bedrock_api_key,
ServiceProviders.SPEACHES.value: self._check_speaches_api_key,
ServiceProviders.OPENAI_REALTIME.value: self._check_openai_api_key,
ServiceProviders.GOOGLE_REALTIME.value: self._check_google_api_key,
}
async def validate(
@ -70,6 +72,10 @@ class UserConfigurationValidator:
configuration.embeddings, "embeddings", required=False
)
)
# Realtime is optional - only validate if configured
status_list.extend(
self._validate_service(configuration.realtime, "realtime", required=False)
)
if status_list:
raise ValueError(status_list)

View file

@ -29,7 +29,7 @@ def contains_masked_key(api_key: str | list[str] | None) -> bool:
def check_for_masked_keys(config: "UserConfiguration") -> None:
"""Raise ValueError if any service in *config* still has a masked API key."""
for field in ("llm", "tts", "stt", "embeddings"):
for field in ("llm", "tts", "stt", "embeddings", "realtime"):
service = getattr(config, field, None)
if service is None:
continue
@ -121,6 +121,8 @@ def mask_user_config(config: UserConfiguration) -> Dict[str, Any]:
"tts": _mask_service(config.tts),
"stt": _mask_service(config.stt),
"embeddings": _mask_service(config.embeddings),
"realtime": _mask_service(config.realtime),
"is_realtime": config.is_realtime,
"test_phone_number": config.test_phone_number,
"timezone": config.timezone,
}

View file

@ -9,7 +9,7 @@ from typing import Dict
from api.schemas.user_configuration import UserConfiguration
from api.services.configuration.masking import resolve_masked_api_keys
SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings")
SERVICE_FIELDS = ("llm", "tts", "stt", "embeddings", "realtime")
def merge_user_configurations(
@ -64,6 +64,9 @@ def merge_user_configurations(
_merge_service_block(service)
# other simple fields
if "is_realtime" in incoming_partial:
merged["is_realtime"] = incoming_partial["is_realtime"]
if "test_phone_number" in incoming_partial:
merged["test_phone_number"] = incoming_partial["test_phone_number"]

View file

@ -10,6 +10,7 @@ class ServiceType(Enum):
TTS = auto()
STT = auto()
EMBEDDINGS = auto()
REALTIME = auto()
class ServiceProviders(str, Enum):
@ -28,6 +29,8 @@ class ServiceProviders(str, Enum):
CAMB = "camb"
AWS_BEDROCK = "aws_bedrock"
SPEACHES = "speaches"
OPENAI_REALTIME = "openai_realtime"
GOOGLE_REALTIME = "google_realtime"
class BaseServiceConfiguration(BaseModel):
@ -42,6 +45,8 @@ class BaseServiceConfiguration(BaseModel):
ServiceProviders.DOGRAH,
ServiceProviders.AWS_BEDROCK,
ServiceProviders.SPEACHES,
ServiceProviders.OPENAI_REALTIME,
ServiceProviders.GOOGLE_REALTIME,
# ServiceProviders.SARVAM,
]
api_key: str | list[str]
@ -97,6 +102,7 @@ REGISTRY: Dict[ServiceType, Dict[str, Type[BaseServiceConfiguration]]] = {
ServiceType.TTS: {},
ServiceType.STT: {},
ServiceType.EMBEDDINGS: {},
ServiceType.REALTIME: {},
}
T = TypeVar("T", bound=BaseServiceConfiguration)
@ -279,6 +285,68 @@ class SpeachesLLMConfiguration(BaseLLMConfiguration):
api_key: str | list[str] | None = Field(default=None)
OPENAI_REALTIME_MODELS = ["gpt-4o-realtime-preview", "gpt-4o-mini-realtime-preview"]
OPENAI_REALTIME_VOICES = [
"alloy",
"ash",
"ballad",
"coral",
"echo",
"sage",
"shimmer",
"verse",
]
# @register_service(ServiceType.REALTIME)
# class OpenAIRealtimeLLMConfiguration(BaseLLMConfiguration):
# provider: Literal[ServiceProviders.OPENAI_REALTIME] = (
# ServiceProviders.OPENAI_REALTIME
# )
# model: str = Field(
# default="gpt-4o-realtime-preview",
# json_schema_extra={
# "examples": OPENAI_REALTIME_MODELS,
# "allow_custom_input": True,
# },
# )
# voice: str = Field(
# default="alloy",
# json_schema_extra={"examples": OPENAI_REALTIME_VOICES},
# )
GOOGLE_REALTIME_MODELS = ["gemini-3.1-flash-live-preview"]
GOOGLE_REALTIME_VOICES = ["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
@register_service(ServiceType.REALTIME)
class GoogleRealtimeLLMConfiguration(BaseLLMConfiguration):
provider: Literal[ServiceProviders.GOOGLE_REALTIME] = (
ServiceProviders.GOOGLE_REALTIME
)
model: str = Field(
default="gemini-2.0-flash-live-001",
json_schema_extra={
"examples": GOOGLE_REALTIME_MODELS,
"allow_custom_input": True,
},
)
voice: str = Field(
default="Puck",
json_schema_extra={
"examples": GOOGLE_REALTIME_VOICES,
"allow_custom_input": True,
},
)
REALTIME_PROVIDERS = {
ServiceProviders.OPENAI_REALTIME.value,
ServiceProviders.GOOGLE_REALTIME.value,
}
LLMConfig = Annotated[
Union[
OpenAILLMService,
@ -293,6 +361,14 @@ LLMConfig = Annotated[
Field(discriminator="provider"),
]
RealtimeConfig = Annotated[
Union[
# OpenAIRealtimeLLMConfiguration,
GoogleRealtimeLLMConfiguration,
],
Field(discriminator="provider"),
]
###################################################### TTS ########################################################################
@ -719,6 +795,7 @@ SPEACHES_STT_MODELS = [
"Systran/faster-distil-whisper-small.en",
"Systran/faster-whisper-large-v3",
]
SPEACHES_STT_LANGUAGES = ["en", "ar", "nl", "fr", "de", "hi", "it", "pt", "es"]
@register_stt
@ -731,6 +808,13 @@ class SpeachesSTTConfiguration(BaseSTTConfiguration):
"allow_custom_input": True,
},
)
language: str = Field(
default="en",
json_schema_extra={
"examples": SPEACHES_STT_LANGUAGES,
"allow_custom_input": True,
},
)
base_url: str = Field(
default="http://localhost:8000/v1",
description="OpenAI-compatible STT endpoint (Speaches, etc.)",
@ -785,6 +869,6 @@ EmbeddingsConfig = Annotated[
]
ServiceConfig = Annotated[
Union[LLMConfig, TTSConfig, STTConfig, EmbeddingsConfig],
Union[LLMConfig, RealtimeConfig, TTSConfig, STTConfig, EmbeddingsConfig],
Field(discriminator="provider"),
]

View file

@ -97,6 +97,34 @@ def build_pipeline(
return Pipeline(processors)
def build_realtime_pipeline(
transport,
realtime_llm,
audio_buffer,
user_context_aggregator,
assistant_context_aggregator,
pipeline_engine_callback_processor,
pipeline_metrics_aggregator,
):
"""Build a pipeline for realtime (speech-to-speech) LLM services.
Realtime services (e.g. OpenAI Realtime, Gemini Live) handle STT+LLM+TTS
internally, so no separate STT or TTS processors are needed.
"""
processors = [
transport.input(),
user_context_aggregator,
realtime_llm,
pipeline_engine_callback_processor,
transport.output(),
audio_buffer,
assistant_context_aggregator,
pipeline_metrics_aggregator,
]
return Pipeline(processors)
def create_pipeline_task(pipeline, workflow_run_id, audio_config: AudioConfig = None):
"""Create a pipeline task with appropriate parameters"""
# Set up pipeline params with audio configuration if provided

View file

@ -16,6 +16,7 @@ from api.services.pipecat.event_handlers import (
from api.services.pipecat.in_memory_buffers import InMemoryLogsBuffer
from api.services.pipecat.pipeline_builder import (
build_pipeline,
build_realtime_pipeline,
create_pipeline_components,
create_pipeline_task,
)
@ -35,6 +36,7 @@ from api.services.pipecat.recording_router_processor import RecordingRouterProce
from api.services.pipecat.service_factory import (
create_llm_service,
create_llm_service_from_provider,
create_realtime_llm_service,
create_stt_service,
create_tts_service,
)
@ -603,10 +605,18 @@ async def _run_pipeline(
term.strip() for term in dictionary.split(",") if term.strip()
]
# Detect realtime mode (speech-to-speech services like OpenAI Realtime, Gemini Live)
is_realtime = user_config.is_realtime and user_config.realtime is not None
# Create services based on user configuration
stt = create_stt_service(user_config, audio_config, keyterms=keyterms)
tts = create_tts_service(user_config, audio_config)
llm = create_llm_service(user_config)
if is_realtime:
llm = create_realtime_llm_service(user_config, audio_config)
stt = None
tts = None
else:
stt = create_stt_service(user_config, audio_config, keyterms=keyterms)
tts = create_tts_service(user_config, audio_config)
llm = create_llm_service(user_config)
workflow_graph = WorkflowGraph(
ReactFlowDTO.model_validate(workflow.workflow_definition_with_fallback)
@ -694,46 +704,66 @@ async def _run_pipeline(
)
# Configure turn strategies based on STT provider, model, and workflow configuration
# Deepgram Flux uses external turn detection (VAD + External start/stop)
# Other models use configurable turn detection strategy
is_deepgram_flux = (
user_config.stt.provider == ServiceProviders.DEEPGRAM.value
and user_config.stt.model == "flux-general-en"
)
if is_realtime:
# Realtime services have server-side VAD/turn detection.
# For stop strategy, lets rely on SmartTurnAnalyzer which is
# enabled by default
user_turn_strategies = UserTurnStrategies(
start=[VADUserTurnStartStrategy()], stop=[]
)
if is_deepgram_flux:
user_turn_strategies = UserTurnStrategies(
start=[
VADUserTurnStartStrategy(),
ExternalUserTurnStartStrategy(enable_interruptions=True),
],
stop=[ExternalUserTurnStopStrategy()],
)
elif turn_stop_strategy == "turn_analyzer":
# Smart Turn Analyzer: best for longer responses with natural pauses
smart_turn_params = SmartTurnParams(stop_secs=smart_turn_stop_secs)
user_turn_strategies = UserTurnStrategies(
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
stop=[
TurnAnalyzerUserTurnStopStrategy(
turn_analyzer=LocalSmartTurnAnalyzerV3(params=smart_turn_params)
)
],
)
# Lets not start the pipeline as muted for Realtime
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
user_mute_strategies = [
FunctionCallUserMuteStrategy(),
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
]
else:
# Transcription-based (default): best for short 1-2 word responses
user_turn_strategies = UserTurnStrategies(
start=[VADUserTurnStartStrategy(), TranscriptionUserTurnStartStrategy()],
stop=[SpeechTimeoutUserTurnStopStrategy()],
# Deepgram Flux uses external turn detection (VAD + External start/stop)
# Other models use configurable turn detection strategy
is_deepgram_flux = (
user_config.stt.provider == ServiceProviders.DEEPGRAM.value
and user_config.stt.model == "flux-general-en"
)
# Create user mute strategies
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
user_mute_strategies = [
MuteUntilFirstBotCompleteUserMuteStrategy(),
FunctionCallUserMuteStrategy(),
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
]
if is_deepgram_flux:
user_turn_strategies = UserTurnStrategies(
start=[
VADUserTurnStartStrategy(),
ExternalUserTurnStartStrategy(enable_interruptions=True),
],
stop=[ExternalUserTurnStopStrategy()],
)
elif turn_stop_strategy == "turn_analyzer":
# Smart Turn Analyzer: best for longer responses with natural pauses
smart_turn_params = SmartTurnParams(stop_secs=smart_turn_stop_secs)
user_turn_strategies = UserTurnStrategies(
start=[
VADUserTurnStartStrategy(),
TranscriptionUserTurnStartStrategy(),
],
stop=[
TurnAnalyzerUserTurnStopStrategy(
turn_analyzer=LocalSmartTurnAnalyzerV3(params=smart_turn_params)
)
],
)
else:
# Transcription-based (default): best for short 1-2 word responses
user_turn_strategies = UserTurnStrategies(
start=[
VADUserTurnStartStrategy(),
TranscriptionUserTurnStartStrategy(),
],
stop=[SpeechTimeoutUserTurnStopStrategy()],
)
# - CallbackUserMuteStrategy: mutes based on engine's _mute_pipeline state
user_mute_strategies = [
MuteUntilFirstBotCompleteUserMuteStrategy(),
FunctionCallUserMuteStrategy(),
CallbackUserMuteStrategy(should_mute_callback=engine.should_mute_user),
]
user_params = LLMUserAggregatorParams(
user_turn_strategies=user_turn_strategies,
@ -769,77 +799,93 @@ async def _run_pipeline(
async def on_user_turn_started(aggregator, strategy):
user_idle_handler.reset()
# Create voicemail detector if enabled in workflow configurations
# Voicemail detection and recording router are not supported in realtime mode
voicemail_detector = None
voicemail_config = (workflow.workflow_configurations or {}).get(
"voicemail_detection", {}
)
if voicemail_config.get("enabled", False):
logger.info(f"Voicemail detection enabled for workflow run {workflow_run_id}")
# Create a separate LLM instance for the voicemail sub-pipeline
# (can't share with main pipeline as it would mess up frame linking)
if voicemail_config.get("use_workflow_llm", True):
voicemail_llm = create_llm_service(user_config)
else:
voicemail_llm = create_llm_service_from_provider(
provider=voicemail_config.get("provider", "openai"),
model=voicemail_config.get("model", "gpt-4.1"),
api_key=voicemail_config.get("api_key", ""),
)
long_speech_timeout = voicemail_config.get("long_speech_timeout", 8.0)
custom_system_prompt = voicemail_config.get("system_prompt") or None
voicemail_detector = VoicemailDetector(
llm=voicemail_llm,
long_speech_timeout=long_speech_timeout,
custom_system_prompt=custom_system_prompt,
)
# Register event handler to end task when voicemail is detected
@voicemail_detector.event_handler("on_voicemail_detected")
async def _on_voicemail_detected(_processor):
logger.info(f"Voicemail detected for workflow run {workflow_run_id}")
await engine.end_call_with_reason(
reason=EndTaskReason.VOICEMAIL_DETECTED.value,
abort_immediately=True,
)
# Create recording router if workflow has active recordings
recording_router = None
if has_recordings:
fetch_audio = create_recording_audio_fetcher(
organization_id=workflow.organization_id,
pipeline_sample_rate=audio_config.pipeline_sample_rate,
if not is_realtime:
# Create voicemail detector if enabled in workflow configurations
voicemail_config = (workflow.workflow_configurations or {}).get(
"voicemail_detection", {}
)
recording_router = RecordingRouterProcessor(
audio_sample_rate=audio_config.pipeline_sample_rate,
fetch_recording_audio=fetch_audio,
)
# Warm the recording cache in the background so audio is ready
# before the first playback request.
asyncio.create_task(
warm_recording_cache(
workflow_id=workflow_id,
if voicemail_config.get("enabled", False):
logger.info(
f"Voicemail detection enabled for workflow run {workflow_run_id}"
)
# Create a separate LLM instance for the voicemail sub-pipeline
# (can't share with main pipeline as it would mess up frame linking)
if voicemail_config.get("use_workflow_llm", True):
voicemail_llm = create_llm_service(user_config)
else:
voicemail_llm = create_llm_service_from_provider(
provider=voicemail_config.get("provider", "openai"),
model=voicemail_config.get("model", "gpt-4.1"),
api_key=voicemail_config.get("api_key", ""),
)
long_speech_timeout = voicemail_config.get("long_speech_timeout", 8.0)
custom_system_prompt = voicemail_config.get("system_prompt") or None
voicemail_detector = VoicemailDetector(
llm=voicemail_llm,
long_speech_timeout=long_speech_timeout,
custom_system_prompt=custom_system_prompt,
)
# Register event handler to end task when voicemail is detected
@voicemail_detector.event_handler("on_voicemail_detected")
async def _on_voicemail_detected(_processor):
logger.info(f"Voicemail detected for workflow run {workflow_run_id}")
await engine.end_call_with_reason(
reason=EndTaskReason.VOICEMAIL_DETECTED.value,
abort_immediately=True,
)
# Create recording router if workflow has active recordings
if has_recordings:
fetch_audio = create_recording_audio_fetcher(
organization_id=workflow.organization_id,
pipeline_sample_rate=audio_config.pipeline_sample_rate,
)
)
recording_router = RecordingRouterProcessor(
audio_sample_rate=audio_config.pipeline_sample_rate,
fetch_recording_audio=fetch_audio,
)
# Warm the recording cache in the background so audio is ready
# before the first playback request.
asyncio.create_task(
warm_recording_cache(
workflow_id=workflow_id,
organization_id=workflow.organization_id,
pipeline_sample_rate=audio_config.pipeline_sample_rate,
)
)
# Build the pipeline with the STT mute filter and context controller
pipeline = build_pipeline(
transport,
stt,
audio_buffer,
llm,
tts,
user_context_aggregator,
assistant_context_aggregator,
pipeline_engine_callback_processor,
pipeline_metrics_aggregator,
voicemail_detector=voicemail_detector,
recording_router=recording_router,
)
# Build the pipeline
if is_realtime:
pipeline = build_realtime_pipeline(
transport,
llm,
audio_buffer,
user_context_aggregator,
assistant_context_aggregator,
pipeline_engine_callback_processor,
pipeline_metrics_aggregator,
)
else:
pipeline = build_pipeline(
transport,
stt,
audio_buffer,
llm,
tts,
user_context_aggregator,
assistant_context_aggregator,
pipeline_engine_callback_processor,
pipeline_metrics_aggregator,
voicemail_detector=voicemail_detector,
recording_router=recording_router,
)
# Create pipeline task with audio configuration
task = create_pipeline_task(pipeline, workflow_run_id, audio_config)
@ -847,7 +893,8 @@ async def _run_pipeline(
# Now set the task on the engine
engine.set_task(task)
# Initialize the engine to set the initial context
# Initialize the engine to set the initial context with
# System Prompt and Tools
await engine.initialize()
# Add real-time feedback observer (always logs to buffer, streams to WS if available)

View file

@ -35,6 +35,7 @@ from pipecat.services.openai.tts import OpenAITTSService, OpenAITTSSettings
from pipecat.services.openrouter.llm import OpenRouterLLMService, OpenRouterLLMSettings
from pipecat.services.sarvam.stt import SarvamSTTService, SarvamSTTSettings
from pipecat.services.sarvam.tts import SarvamTTSService, SarvamTTSSettings
from pipecat.services.speaches.llm import SpeachesLLMService, SpeachesLLMSettings
from pipecat.services.speaches.stt import SpeachesSTTService, SpeachesSTTSettings
from pipecat.services.speaches.tts import SpeachesTTSService, SpeachesTTSSettings
from pipecat.services.speechmatics.stt import (
@ -63,7 +64,6 @@ def create_stt_service(
if user_config.stt.provider == ServiceProviders.DEEPGRAM.value:
# Check if using Flux model (English-only, no language selection)
if user_config.stt.model == "flux-general-en":
logger.debug("Using DeepGram Flux Model")
return DeepgramFluxSTTService(
api_key=user_config.stt.api_key,
settings=DeepgramFluxSTTSettings(
@ -395,15 +395,75 @@ def create_llm_service_from_provider(
settings=AWSBedrockLLMSettings(model=model),
)
elif provider == ServiceProviders.SPEACHES.value:
return OpenAILLMService(
return SpeachesLLMService(
base_url=base_url or "http://localhost:11434/v1",
api_key=api_key or "none",
settings=OpenAILLMSettings(model=model),
settings=SpeachesLLMSettings(model=model),
)
else:
raise HTTPException(status_code=400, detail=f"Invalid LLM provider {provider}")
def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
"""Create a realtime (speech-to-speech) LLM service that handles STT+LLM+TTS.
These services bypass separate STT/TTS and handle audio directly via
a bidirectional WebSocket connection. Reads from user_config.realtime.
"""
realtime_config = user_config.realtime
provider = realtime_config.provider
model = realtime_config.model
api_key = realtime_config.api_key
voice = getattr(realtime_config, "voice", None)
logger.info(
f"Creating realtime LLM service: provider={provider}, model={model}, voice={voice}"
)
if provider == ServiceProviders.OPENAI_REALTIME.value:
from pipecat.services.openai.realtime.events import (
AudioConfiguration,
AudioInput,
AudioOutput,
InputAudioTranscription,
SessionProperties,
)
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
return OpenAIRealtimeLLMService(
api_key=api_key,
settings=OpenAIRealtimeLLMService.Settings(
model=model,
session_properties=SessionProperties(
audio=AudioConfiguration(
input=AudioInput(
transcription=InputAudioTranscription(),
),
output=AudioOutput(
voice=voice or "alloy",
),
),
),
),
)
elif provider == ServiceProviders.GOOGLE_REALTIME.value:
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
# Gemini Live enables input/output audio transcription by default
# in its _connect() method — no need to configure it explicitly.
return GeminiLiveLLMService(
api_key=api_key,
settings=GeminiLiveLLMService.Settings(
model=model,
voice=voice or "Puck", # vad=GeminiVADParams(disabled=True)
),
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid realtime LLM provider {provider}"
)
def create_llm_service(user_config):
"""Create and return appropriate LLM service based on user configuration."""
provider = user_config.llm.provider

View file

@ -210,12 +210,17 @@ class PipecatEngine:
async def _update_llm_context(self, system_prompt: str, functions: list[dict]):
"""Update LLM settings with the composed system prompt and tool list."""
await self.llm._update_settings(LLMSettings(system_instruction=system_prompt))
if functions:
tools_schema = ToolsSchema(standard_tools=functions)
self.context.set_tools(tools_schema)
await self.llm._update_settings(LLMSettings(system_instruction=system_prompt))
# For Gemini Live, set context on the LLM before _update_settings so that
# _connect (triggered by reconnect) can read tools from it.
if hasattr(self.llm, "_context") and not self.llm._context and self.context:
self.llm._context = self.context
def _format_prompt(self, prompt: str) -> str:
"""Delegate prompt formatting to the shared workflow.utils implementation."""

View file

@ -215,13 +215,17 @@ class VariableExtractionManager:
with tracer.start_as_current_span(
"llm-variable-extraction", context=parent_ctx
) as span:
tracing_messages = [
{"role": "system", "content": system_prompt},
*extraction_messages,
]
add_llm_span_attributes(
span,
service_name=self._engine.llm.__class__.__name__,
model=model_name,
operation_name="llm-variable-extraction",
messages=extraction_messages,
output=llm_response,
messages=tracing_messages,
output=json.dumps({"content": llm_response}),
stream=False,
parameters={},
)

View file

@ -59,7 +59,14 @@ async def _generate_conversation_summary(
)
span_name = f"conversation-summary-before-{node_name}"
add_qa_span_to_trace(parent_ctx, model, messages, summary, span_name)
add_qa_span_to_trace(
parent_ctx,
model,
messages,
summary,
span_name,
CONVERSATION_SUMMARY_SYSTEM_PROMPT,
)
return summary
except Exception as e:
@ -189,7 +196,9 @@ async def run_per_node_qa_analysis(
# Trace
span_name = f"qa-node-{node_name}"
add_qa_span_to_trace(parent_ctx, model, messages, raw_response, span_name)
add_qa_span_to_trace(
parent_ctx, model, messages, raw_response, span_name, system_content
)
# Parse response
node_result: dict[str, Any] = {
@ -299,7 +308,9 @@ async def _run_whole_call_qa_analysis(
# Langfuse tracing
parent_ctx = setup_langfuse_parent_context(workflow_run)
add_qa_span_to_trace(parent_ctx, model, messages, raw_response, "qa-analysis")
add_qa_span_to_trace(
parent_ctx, model, messages, raw_response, "qa-analysis", system_content
)
return {
"node_results": {"whole_call": node_result},

View file

@ -1,5 +1,7 @@
"""LLM configuration resolution and token usage accumulation."""
import random
from api.db import db_client
from api.db.models import WorkflowRunModel
@ -57,6 +59,8 @@ async def resolve_user_llm_config(
provider = llm_config.get("provider", "openai")
api_key = llm_config.get("api_key", "")
if isinstance(api_key, list):
api_key = random.choice(api_key)
model = llm_config.get("model", "gpt-4.1")
kwargs = {}

View file

@ -166,7 +166,9 @@ async def ensure_node_summaries(
continue
# Create a Langfuse trace for this summary generation
trace_url = create_node_summary_trace(model, messages, summary_text, node_name)
trace_url = create_node_summary_trace(
model, messages, summary_text, node_name, NODE_SUMMARY_SYSTEM_PROMPT
)
entry: dict[str, Any] = {"summary": summary_text}
if trace_url:

View file

@ -1,5 +1,6 @@
"""Langfuse / OpenTelemetry tracing helpers for QA analysis."""
import json
import re
from loguru import logger
@ -70,6 +71,7 @@ def add_qa_span_to_trace(
messages: list[dict],
output: str,
span_name: str,
system_prompt: str = "",
) -> None:
"""Create a child span under the conversation trace."""
if parent_ctx is None:
@ -84,13 +86,21 @@ def add_qa_span_to_trace(
span_name,
context=parent_ctx,
) as span:
tracing_messages = (
[
{"role": "system", "content": system_prompt},
*messages,
]
if system_prompt
else messages
)
add_llm_span_attributes(
span,
service_name="OpenAILLMService",
model=model,
operation_name=span_name,
messages=messages,
output=output,
messages=tracing_messages,
output=json.dumps({"content": output}),
stream=False,
parameters={"temperature": 0},
)
@ -103,6 +113,7 @@ def create_node_summary_trace(
messages: list[dict],
output: str,
node_name: str,
system_prompt: str = "",
) -> str | None:
"""Create a standalone Langfuse trace for a node summary generation.
@ -125,13 +136,21 @@ def create_node_summary_trace(
f"node-summary-{node_name}",
context=Context(),
) as span:
tracing_messages = (
[
{"role": "system", "content": system_prompt},
*messages,
]
if system_prompt
else messages
)
add_llm_span_attributes(
span,
service_name="OpenAILLMService",
model=model,
operation_name=f"node-summary-{node_name}",
messages=messages,
output=output,
messages=tracing_messages,
output=json.dumps({"content": output}),
stream=False,
parameters={"temperature": 0},
)