fix: fix interruption handling for Gemini Live

1. Fixes #236
2. Fix run_inference for variable extraction for Gemini Live
This commit is contained in:
Abhishek Kumar 2026-04-15 19:29:07 +05:30
parent 14e6f29f2f
commit e31b38122e
12 changed files with 48 additions and 15 deletions

View file

@ -120,9 +120,21 @@ class InMemoryLogsBuffer:
f"Incremented turn counter to {self._turn_counter} for workflow {self._workflow_run_id}"
)
@staticmethod
def _event_sort_key(event: dict) -> str:
    """Return the timestamp used to order a buffered event.

    Prefers the realtime timestamp stored at ``event["payload"]["timestamp"]``
    and falls back to the event's top-level ``"timestamp"``. Returns an empty
    string when neither is set, so events without any timestamp sort first.

    Args:
        event: A buffered event dict.

    Returns:
        The chosen timestamp, or ``""`` when none is available.
    """
    payload = event.get("payload")
    # ``dict.get("payload", {})`` only applies its default when the key is
    # absent; a payload that is present but None (or not a mapping) must be
    # treated like a missing one instead of raising AttributeError.
    payload_ts = payload.get("timestamp") if isinstance(payload, dict) else None
    # The trailing ``or ""`` also covers an explicit None top-level timestamp,
    # keeping every key comparable with the others during sorting.
    return payload_ts or event.get("timestamp") or ""
def _sorted_events(self) -> List[dict]:
    """Return a new list of the buffered events ordered by their sort key.

    The key comes from ``_event_sort_key``: the realtime (payload) timestamp
    when available, otherwise the buffer-append timestamp. The sort is
    stable, so events sharing a key keep their insertion order, which keeps
    consecutive bot-text chunks of a single turn contiguous. The underlying
    buffer is left untouched.
    """
    ordered = list(self._events)
    ordered.sort(key=self._event_sort_key)
    return ordered
def get_events(self) -> List[dict]:
"""Get all events for final storage."""
return self._events
"""Get all events for final storage, ordered by realtime timestamp."""
return self._sorted_events()
def contains_user_speech(self) -> bool:
"""Return True if any final user transcription event has non-empty text."""
@ -141,7 +153,7 @@ class InMemoryLogsBuffer:
Filters for rtf-user-transcription (final) and rtf-bot-text events,
formats them as '[timestamp] user/assistant: text\\n'.
"""
return _generate_transcript_text(self._events)
return _generate_transcript_text(self._sorted_events())
def write_transcript_to_temp_file(self) -> Optional[str]:
"""Write transcript to a temporary text file and return the path.

View file

@ -616,10 +616,15 @@ async def _run_pipeline(
llm = create_realtime_llm_service(user_config, audio_config)
stt = None
tts = None
# Realtime services don't implement run_inference, so create a
# separate text LLM for variable extraction and other out-of-band
# inference calls.
inference_llm = create_llm_service(user_config)
else:
stt = create_stt_service(user_config, audio_config, keyterms=keyterms)
tts = create_tts_service(user_config, audio_config)
llm = create_llm_service(user_config)
inference_llm = None
workflow_graph = WorkflowGraph(ReactFlowDTO.model_validate(run_workflow_json))
@ -703,9 +708,15 @@ async def _run_pipeline(
context_compaction_enabled = (workflow.workflow_configurations or {}).get(
"context_compaction_enabled", False
)
# Context compaction doesn't apply in realtime mode: the speech-to-speech
# service manages its own conversation state server-side.
if is_realtime and context_compaction_enabled:
logger.info("Disabling context_compaction_enabled for realtime workflow run")
context_compaction_enabled = False
engine = PipecatEngine(
llm=llm,
inference_llm=inference_llm,
workflow=workflow_graph,
call_context_vars=merged_call_context_vars,
workflow_run_id=workflow_run_id,

View file

@ -60,6 +60,7 @@ class PipecatEngine:
*,
task: Optional[PipelineTask] = None,
llm: Optional["LLMService"] = None,
inference_llm: Optional["LLMService"] = None,
context: Optional[LLMContext] = None,
workflow: WorkflowGraph,
call_context_vars: dict,
@ -75,6 +76,12 @@ class PipecatEngine:
):
self.task = task
self.llm = llm
# LLM used for out-of-band inference (variable extraction, context
# summarization). Falls back to the pipeline LLM when not provided.
# In realtime mode the pipeline LLM is a speech-to-speech service
# that does not implement run_inference, so a separate text LLM
# must be passed in.
self.inference_llm = inference_llm or llm
self.context = context
self.workflow = workflow
self._call_context_vars = call_context_vars

View file

@ -63,7 +63,7 @@ class ContextSummarizationManager:
orphaned tool calls from previous nodes) with a concise summary.
"""
context = self._engine.context
llm = self._engine.llm
llm = self._engine.inference_llm
current_node = self._engine._current_node
try:

View file

@ -203,12 +203,12 @@ class VariableExtractionManager:
# current node's system prompt that build_chat_completion_params
# would otherwise prepend.
# ------------------------------------------------------------------
llm_response = await self._engine.llm.run_inference(
llm_response = await self._engine.inference_llm.run_inference(
extraction_context, system_instruction=system_prompt
)
# Get model name for tracing
model_name = getattr(self._engine.llm, "model_name", "unknown")
model_name = getattr(self._engine.inference_llm, "model_name", "unknown")
if ensure_tracing():
tracer = trace.get_tracer("pipecat")
@ -221,7 +221,7 @@ class VariableExtractionManager:
]
add_llm_span_attributes(
span,
service_name=self._engine.llm.__class__.__name__,
service_name=self._engine.inference_llm.__class__.__name__,
model=model_name,
operation_name="llm-variable-extraction",
messages=tracing_messages,

View file

@ -73,7 +73,7 @@ For example, if you only want to change the voice for a specific agent:
You can also switch an individual agent to use a **Realtime** provider (such as Gemini Live) even if the global configuration uses standard LLM + TTS + STT. Toggle the **Realtime** switch in the Model Overrides tab, then configure the realtime provider, model, and voice.
<Note>
When an agent uses a Realtime provider, it replaces the separate LLM, TTS, and STT services with a single speech-to-speech model. The individual LLM/TTS/STT override tabs are hidden in this mode.
When an agent uses a Realtime provider, it replaces the separate TTS and STT services with a single speech-to-speech model. An **LLM** is still required alongside the Realtime model — it's used for out-of-band tasks like variable extraction and QA analysis, which the realtime service does not handle. Context compaction is not applicable in Realtime mode and is ignored if enabled.
</Note>
## Gemini 3.1 Live
@ -119,5 +119,5 @@ To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow the
6. Select the language (currently `en` is supported).
<Note>
When using a Realtime provider like Gemini Live, you do not need to configure separate LLM, TTS, and STT services — the realtime model handles all three.
When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform.
</Note>

@ -1 +1 @@
Subproject commit 49f1965d652fb5027968dcc24677bc83c5f905ab
Subproject commit edefaad42b97e52a3ad5eef8d15115a5c6ba3b11

View file

@ -1,2 +1,3 @@
BACKEND_URL=http://localhost:8000
NEXT_PUBLIC_BACKEND_URL=http://localhost:8000
NEXT_PUBLIC_NODE_ENV=development

View file

@ -385,7 +385,7 @@ export const WorkflowEditorHeader = ({
{/* GitHub star badge - desktop only */}
<div className="hidden md:block">
<GitHubStarBadge className="border-[#3a3a3a] text-white" source="workflow_editor_header" />
<GitHubStarBadge className="border-[#3a3a3a] bg-[#2a2a2a] text-white [&_span]:bg-transparent" source="workflow_editor_header" />
</div>
</div>
</div>

View file

@ -84,7 +84,7 @@ export const UnifiedTranscript = ({
}
return (
<TranscriptMessage
key={msg.id}
key={`${msg.id}-${index}`}
message={msg}
nextMessage={transcriptMessages[index + 1]}
/>

View file

@ -582,7 +582,7 @@ function GeneralSection({
<div>
<h3 className="text-sm font-medium">Context Compaction</h3>
<p className="text-xs text-muted-foreground mt-0.5">
Automatically summarize conversation context when transitioning between nodes.
Automatically summarize conversation context when transitioning between nodes. Not applicable in Realtime mode — the speech-to-speech service manages its own conversation state, so this setting is ignored.
</p>
</div>
<div className="flex items-center justify-between">

View file

@ -52,6 +52,7 @@ const STANDARD_TABS: { key: ServiceSegment; label: string }[] = [
// NOTE(review): presumably the configuration tabs offered while Realtime mode
// is enabled — confirm against where this constant is consumed.
// "llm" is listed next to "realtime" because, per the Realtime Mode help text
// in this form, an LLM is still required for variable extraction and QA.
const REALTIME_TABS: { key: ServiceSegment; label: string }[] = [
{ key: "realtime", label: "Realtime Model" },
{ key: "llm", label: "LLM" },
{ key: "embeddings", label: "Embedding" },
];
@ -63,6 +64,7 @@ const OVERRIDE_STANDARD_TABS: { key: ServiceSegment; label: string }[] = [
// NOTE(review): presumably the tabs offered in override mode when Realtime is
// enabled — confirm against where this constant is consumed.
// The keys match the services the override save path iterates in realtime
// mode ("realtime" and "llm").
const OVERRIDE_REALTIME_TABS: { key: ServiceSegment; label: string }[] = [
{ key: "realtime", label: "Realtime Model" },
{ key: "llm", label: "LLM" },
];
// Display names for Sarvam voices
@ -407,7 +409,7 @@ export function ServiceConfigurationForm({
if (mode === 'override') {
// Build model_overrides for enabled services only
const modelOverrides: Record<string, unknown> = {};
const services = isRealtime ? ["realtime"] : ["llm", "tts", "stt"];
const services = isRealtime ? ["realtime", "llm"] : ["llm", "tts", "stt"];
for (const svc of services) {
if (enabledOverrides[svc]) {
modelOverrides[svc] = buildServiceConfig(svc as ServiceSegment, data);
@ -758,7 +760,7 @@ export function ServiceConfigurationForm({
Realtime Mode
</Label>
<p className="text-xs text-muted-foreground mt-0.5">
Uses a single speech-to-speech model (no separate STT/TTS)
Uses a single speech-to-speech model (no separate STT/TTS). An LLM is still required for variable extraction and QA.
</p>
</div>
<Switch