feat: add hybrid text + recording functionality in agents (#191)

* feat: add recording feature in agents

* chore: pin pipecat version

* feat: show usage in UI

* chore: update pipecat
This commit is contained in:
Abhishek 2026-03-16 15:04:08 +05:30 committed by GitHub
parent f075bcb623
commit 494c60d774
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
43 changed files with 2865 additions and 397 deletions

View file

@ -5,6 +5,7 @@ from api.services.workflow.disposition_mapper import (
get_organization_id_from_workflow_run,
)
from api.services.workflow.workflow import Node, WorkflowGraph
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
@ -16,6 +17,7 @@ from pipecat.frames.frames import (
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.settings import LLMSettings
from pipecat.utils.enums import EndTaskReason
if TYPE_CHECKING:
@ -31,18 +33,19 @@ import asyncio
from loguru import logger
from api.services.workflow import pipecat_engine_callbacks as engine_callbacks
from api.services.workflow.pipecat_engine_custom_tools import CustomToolManager
from api.services.workflow.pipecat_engine_utils import (
from api.services.workflow.pipecat_engine_context_composer import (
compose_functions_for_node,
compose_system_prompt_for_node,
)
from api.services.workflow.pipecat_engine_custom_tools import (
CustomToolManager,
get_function_schema,
render_template,
update_llm_context,
)
from api.services.workflow.pipecat_engine_variable_extractor import (
VariableExtractionManager,
)
from api.services.workflow.tools.calculator import get_calculator_tools, safe_calculator
from api.services.workflow.tools.knowledge_base import (
get_knowledge_base_tool,
retrieve_from_knowledge_base,
)
from api.services.workflow.tools.timezone import (
@ -50,6 +53,7 @@ from api.services.workflow.tools.timezone import (
get_current_time,
get_time_tools,
)
from api.utils.template_renderer import render_template
class PipecatEngine:
@ -68,6 +72,7 @@ class PipecatEngine:
embeddings_api_key: Optional[str] = None,
embeddings_model: Optional[str] = None,
embeddings_base_url: Optional[str] = None,
has_recordings: bool = False,
):
self.task = task
self.llm = llm
@ -113,6 +118,10 @@ class PipecatEngine:
# Audio configuration (set via set_audio_config from _run_pipeline)
self._audio_config = None
# True when the workflow has active recordings; enables recording
# response mode instructions on all nodes for in-context learning.
self._has_recordings: bool = has_recordings
async def _get_organization_id(self) -> Optional[int]:
"""Get and cache the organization ID from workflow run."""
if self._custom_tool_manager:
@ -194,15 +203,14 @@ class PipecatEngine:
logger.error(f"Error initializing {self.__class__.__name__}: {e}")
raise
def _get_function_schema(self, function_name: str, description: str):
"""Thin wrapper around utils.get_function_schema for backwards compatibility."""
async def _update_llm_context(self, system_prompt: str, functions: list[dict]):
"""Update LLM settings with the composed system prompt and tool list."""
return get_function_schema(function_name, description)
await self.llm._update_settings(LLMSettings(system_instruction=system_prompt))
async def _update_llm_context(self, system_message: dict, functions: list[dict]):
"""Delegate context update to the shared workflow.utils implementation."""
update_llm_context(self.context, system_message, functions)
if functions:
tools_schema = ToolsSchema(standard_tools=functions)
self.context.set_tools(tools_schema)
def _format_prompt(self, prompt: str) -> str:
"""Delegate prompt formatting to the shared workflow.utils implementation."""
@ -473,12 +481,19 @@ class PipecatEngine:
if node.document_uuids:
await self._register_knowledge_base_function(node.document_uuids)
# Set up system message and functions
(
system_message,
functions,
) = await self._compose_system_message_functions_for_node(node)
await self._update_llm_context(system_message, functions)
# Compose prompt and functions via the context composer module
system_prompt = compose_system_prompt_for_node(
node=node,
workflow=self.workflow,
format_prompt=self._format_prompt,
has_recordings=self._has_recordings,
)
functions = await compose_functions_for_node(
node=node,
builtin_function_schemas=self.builtin_function_schemas,
custom_tool_manager=self._custom_tool_manager,
)
await self._update_llm_context(system_prompt, functions)
async def set_node(self, node_id: str):
"""
@ -610,62 +625,6 @@ class PipecatEngine:
)
await self.task.queue_frame(frame_to_push)
async def _compose_system_message_functions_for_node(
self, node: "Node"
) -> tuple[list[dict], list[dict]]:
"""Generate the system messages and function schemas for the given node.
This performs the same formatting logic used when entering a node but
does **not** register the functions with the LLM; callers are
responsible for that.
"""
global_prompt = ""
if self.workflow.global_node_id and node.add_global_prompt:
global_node = self.workflow.nodes[self.workflow.global_node_id]
global_prompt = self._format_prompt(global_node.prompt)
functions: list[dict] = []
# Add built-in function schemas (calculator and timezone tools)
functions.extend(self.builtin_function_schemas)
# Add knowledge base retrieval tool if node has documents
if node.document_uuids:
kb_tool_def = get_knowledge_base_tool(node.document_uuids)
kb_schema = get_function_schema(
kb_tool_def["function"]["name"],
kb_tool_def["function"]["description"],
properties=kb_tool_def["function"]["parameters"].get("properties", {}),
required=kb_tool_def["function"]["parameters"].get("required", []),
)
functions.append(kb_schema)
# Add custom tools from node.tool_uuids
if node.tool_uuids and self._custom_tool_manager:
custom_tool_schemas = await self._custom_tool_manager.get_tool_schemas(
node.tool_uuids
)
functions.extend(custom_tool_schemas)
# Transition functions (schema only; registration handled elsewhere)
for outgoing_edge in node.out_edges:
function_schema = self._get_function_schema(
outgoing_edge.get_function_name(), outgoing_edge.condition
)
functions.append(function_schema)
formatted_node_prompt = self._format_prompt(node.prompt)
system_message = {
"role": "system",
"content": "\n\n".join(
p for p in (global_prompt, formatted_node_prompt) if p
),
}
return system_message, functions
async def should_mute_user(self, frame: "Frame") -> bool:
"""
Callback for CallbackUserMuteStrategy to determine if the user should be muted.

View file

@ -0,0 +1,138 @@
"""System prompt and function schema composition for PipecatEngine nodes.
Extracts prompt and function composition logic from PipecatEngine into
reusable functions. Defines recording response mode markers and instructions.
"""
from typing import TYPE_CHECKING, Callable, Optional
if TYPE_CHECKING:
from api.services.workflow.pipecat_engine_custom_tools import CustomToolManager
from api.services.workflow.workflow import Node, WorkflowGraph
from api.services.workflow.pipecat_engine_custom_tools import get_function_schema
from api.services.workflow.tools.knowledge_base import get_knowledge_base_tool
# ---------------------------------------------------------------------------
# Recording response mode markers
# ---------------------------------------------------------------------------
# NOTE(review): both markers are empty strings as shown here, and the
# instruction text that follows refers to them with empty backticks.
# Presumably the marker glyphs were lost somewhere - confirm the intended
# characters against the repository before relying on these values.
RECORDING_MARKER = "" # Play pre-recorded audio
TTS_MARKER = "" # Generate dynamic TTS text
# ---------------------------------------------------------------------------
# Recording response mode system prompt instructions
# ---------------------------------------------------------------------------
RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
Every response you generate MUST begin with a response mode indicator.
You have two modes for responding:
1. DYNAMIC SPEECH (): Generate text that will be converted to speech by TTS.
Format: `` followed by a space and your full spoken response.
Example: Hello! How can I help you today?
2. PRE-RECORDED AUDIO (): Play a pre-recorded audio message.
Format: `` followed by a space and ONLY the recording_id. Nothing else.
Example: rec_greeting_01
RULES:
- Your response MUST start with either `` or `` as the very first character.
- For `` (dynamic speech): Follow with a space and your full response text.
- For `` (pre-recorded audio): Follow with a space and ONLY the recording_id. No other text.
- Use `` when a pre-recorded message matches the situation well.
- Use `` when you need to generate a dynamic, contextual response.
- NEVER mix modes in a single response. Choose one."""
def compose_system_prompt_for_node(
    *,
    node: "Node",
    workflow: "WorkflowGraph",
    format_prompt: Callable[[str], str],
    has_recordings: bool,
) -> str:
    """Assemble the system prompt text for one workflow node.

    The result is built from up to three sections, joined by blank lines:
    the formatted global prompt, the formatted node prompt, and the
    recording response mode instructions.

    Args:
        node: The workflow node to compose the prompt for.
        workflow: The workflow graph; its global node supplies the global
            prompt when ``node.add_global_prompt`` is set.
        format_prompt: Callable applied to each raw prompt before use.
        has_recordings: When true, the recording response mode
            instructions are appended as the last section.

    Returns:
        The composed system prompt text; empty sections are omitted.
    """
    sections: list[str] = []

    # The global prompt goes first, but only when the workflow defines a
    # global node and this node opts in to it.
    if workflow.global_node_id and node.add_global_prompt:
        rendered_global = format_prompt(
            workflow.nodes[workflow.global_node_id].prompt
        )
        if rendered_global:
            sections.append(rendered_global)

    rendered_node = format_prompt(node.prompt)
    if rendered_node:
        sections.append(rendered_node)

    if has_recordings:
        sections.append(RECORDING_RESPONSE_MODE_INSTRUCTIONS)
        # TODO: Append per-node available recordings list here once
        # Node.recording_ids is populated. The list should include
        # recording_id and a short description so the LLM can choose.

    return "\n\n".join(sections)
async def compose_functions_for_node(
    *,
    node: "Node",
    builtin_function_schemas: list[dict],
    custom_tool_manager: Optional["CustomToolManager"],
) -> list[dict]:
    """Collect every function/tool schema that applies to a workflow node.

    Schemas are gathered in a fixed order: built-in tools, the knowledge
    base tool, custom tools, then one transition function per outgoing edge.

    Args:
        node: The workflow node to compose functions for.
        builtin_function_schemas: Pre-computed schemas for built-in tools;
            copied into the result, never mutated.
        custom_tool_manager: Manager for user-defined custom tools, or
            ``None`` when custom tools are unavailable.

    Returns:
        A new list of function schemas to register with the LLM.
    """
    # Built-in tools always come first.
    schemas: list[dict] = list(builtin_function_schemas)

    # Knowledge base retrieval tool, only when the node has documents.
    if node.document_uuids:
        kb_function = get_knowledge_base_tool(node.document_uuids)["function"]
        schemas.append(
            get_function_schema(
                kb_function["name"],
                kb_function["description"],
                properties=kb_function["parameters"].get("properties", {}),
                required=kb_function["parameters"].get("required", []),
            )
        )

    # Custom tools need both tool ids on the node and a manager to resolve them.
    if node.tool_uuids and custom_tool_manager:
        schemas.extend(await custom_tool_manager.get_tool_schemas(node.tool_uuids))

    # One parameter-less transition function per outgoing edge.
    schemas.extend(
        get_function_schema(edge.get_function_name(), edge.condition)
        for edge in node.out_edges
    )
    return schemas

View file

@ -10,7 +10,7 @@ import asyncio
import re
import time
import uuid
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from loguru import logger
@ -23,7 +23,6 @@ from api.services.telephony.transfer_event_protocol import TransferContext
from api.services.workflow.disposition_mapper import (
get_organization_id_from_workflow_run,
)
from api.services.workflow.pipecat_engine_utils import get_function_schema
from api.services.workflow.tools.custom_tool import (
execute_http_tool,
tool_to_function_schema,
@ -42,6 +41,29 @@ if TYPE_CHECKING:
from api.services.workflow.pipecat_engine import PipecatEngine
def get_function_schema(
    function_name: str,
    description: str,
    *,
    properties: Dict[str, Any] | None = None,
    required: List[str] | None = None,
) -> FunctionSchema:
    """Build a ``FunctionSchema`` for a tool the LLM may call.

    The signature stays backward-compatible: callers that pass only
    ``function_name`` and ``description`` keep working and get a
    parameter-less function.

    Args:
        function_name: Name of the function.
        description: Description stored on the schema.
        properties: Parameter definitions; a falsy value becomes ``{}``.
        required: Names of required parameters; a falsy value becomes ``[]``.

    Returns:
        The constructed ``FunctionSchema``.
    """
    schema_properties = properties if properties else {}
    required_names = required if required else []
    return FunctionSchema(
        name=function_name,
        description=description,
        properties=schema_properties,
        required=required_names,
    )
class CustomToolManager:
"""Manager for custom tool registration and execution.

View file

@ -1,68 +0,0 @@
from __future__ import annotations
from typing import Any, Dict, List
from api.utils.template_renderer import render_template
from pipecat.adapters.schemas.function_schema import FunctionSchema
from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.processors.aggregators.llm_context import LLMContext
__all__ = [
"get_function_schema",
"update_llm_context",
"render_template",
]
def get_function_schema(
    function_name: str,
    description: str,
    *,
    properties: Dict[str, Any] | None = None,
    required: List[str] | None = None,
) -> FunctionSchema:
    """Build a ``FunctionSchema`` for a tool the LLM may call.

    The signature stays backward-compatible: callers that pass only
    ``function_name`` and ``description`` keep working and get a
    parameter-less function.

    Args:
        function_name: Name of the function.
        description: Description stored on the schema.
        properties: Parameter definitions; a falsy value becomes ``{}``.
        required: Names of required parameters; a falsy value becomes ``[]``.

    Returns:
        The constructed ``FunctionSchema``.
    """
    schema_properties = properties if properties else {}
    required_names = required if required else []
    return FunctionSchema(
        name=function_name,
        description=description,
        properties=schema_properties,
        required=required_names,
    )
def update_llm_context(
    context: LLMContext,
    system_message: Dict[str, Any],
    functions: List[FunctionSchema],
) -> None:
    """Install ``system_message`` and the current tool list on ``context``.

    A system message at the very start of the history is replaced;
    otherwise the new one is prepended. System messages later in the
    conversation are kept as they are. Tools are only set when
    ``functions`` is non-empty.

    Args:
        context: The LLM context to update in place.
        system_message: The message to place first in the history.
        functions: Function schemas to expose as tools.
    """
    # Wrap the schemas in a ToolsSchema, the form handed to
    # context.set_tools below.
    tools_schema = ToolsSchema(standard_tools=functions)

    history = context.messages
    starts_with_system = bool(history) and history[0]["role"] == "system"
    # Drop only a leading system message; everything else is preserved.
    remaining = history[1:] if starts_with_system else history
    context.set_messages([system_message] + remaining)

    if functions:
        context.set_tools(tools_schema)