feat: add hybrid text + recording functionality in agents (#191)

* feat: add recording feature in agents
* chore: pin pipecat version
* feat: show usage in UI
* chore: update pipecat
2026-07-01 08:59:46 +02:00 · 2026-03-16 15:04:08 +05:30 · 2026-03-16 15:04:08 +05:30 · 494c60d774
commit 494c60d774
parent f075bcb623
43 changed files with 2865 additions and 397 deletions
--- a/api/services/workflow/pipecat_engine.py
+++ b/api/services/workflow/pipecat_engine.py
@ -5,6 +5,7 @@ from api.services.workflow.disposition_mapper import (
    get_organization_id_from_workflow_run,
 )
 from api.services.workflow.workflow import Node, WorkflowGraph
+from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.frames.frames import (
    BotStartedSpeakingFrame,
    BotStoppedSpeakingFrame,
@ -16,6 +17,7 @@ from pipecat.frames.frames import (
 from pipecat.pipeline.task import PipelineTask
 from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.services.llm_service import FunctionCallParams
+from pipecat.services.settings import LLMSettings
 from pipecat.utils.enums import EndTaskReason

 if TYPE_CHECKING:
@ -31,18 +33,19 @@ import asyncio
 from loguru import logger

 from api.services.workflow import pipecat_engine_callbacks as engine_callbacks
-from api.services.workflow.pipecat_engine_custom_tools import CustomToolManager
-from api.services.workflow.pipecat_engine_utils import (
+from api.services.workflow.pipecat_engine_context_composer import (
+    compose_functions_for_node,
+    compose_system_prompt_for_node,
+)
+from api.services.workflow.pipecat_engine_custom_tools import (
+    CustomToolManager,
    get_function_schema,
-    render_template,
-    update_llm_context,
 )
 from api.services.workflow.pipecat_engine_variable_extractor import (
    VariableExtractionManager,
 )
 from api.services.workflow.tools.calculator import get_calculator_tools, safe_calculator
 from api.services.workflow.tools.knowledge_base import (
-    get_knowledge_base_tool,
    retrieve_from_knowledge_base,
 )
 from api.services.workflow.tools.timezone import (
@ -50,6 +53,7 @@ from api.services.workflow.tools.timezone import (
    get_current_time,
    get_time_tools,
 )
+from api.utils.template_renderer import render_template


 class PipecatEngine:
@ -68,6 +72,7 @@ class PipecatEngine:
        embeddings_api_key: Optional[str] = None,
        embeddings_model: Optional[str] = None,
        embeddings_base_url: Optional[str] = None,
+        has_recordings: bool = False,
    ):
        self.task = task
        self.llm = llm
@ -113,6 +118,10 @@ class PipecatEngine:
        # Audio configuration (set via set_audio_config from _run_pipeline)
        self._audio_config = None

+        # True when the workflow has active recordings; enables recording
+        # response mode instructions on all nodes for in-context learning.
+        self._has_recordings: bool = has_recordings
+
    async def _get_organization_id(self) -> Optional[int]:
        """Get and cache the organization ID from workflow run."""
        if self._custom_tool_manager:
@ -194,15 +203,14 @@ class PipecatEngine:
            logger.error(f"Error initializing {self.__class__.__name__}: {e}")
            raise

-    def _get_function_schema(self, function_name: str, description: str):
-        """Thin wrapper around utils.get_function_schema for backwards compatibility."""
+    async def _update_llm_context(self, system_prompt: str, functions: list[dict]):
+        """Update LLM settings with the composed system prompt and tool list."""

-        return get_function_schema(function_name, description)
+        await self.llm._update_settings(LLMSettings(system_instruction=system_prompt))

-    async def _update_llm_context(self, system_message: dict, functions: list[dict]):
-        """Delegate context update to the shared workflow.utils implementation."""
-
-        update_llm_context(self.context, system_message, functions)
+        if functions:
+            tools_schema = ToolsSchema(standard_tools=functions)
+            self.context.set_tools(tools_schema)

    def _format_prompt(self, prompt: str) -> str:
        """Delegate prompt formatting to the shared workflow.utils implementation."""
@ -473,12 +481,19 @@ class PipecatEngine:
        if node.document_uuids:
            await self._register_knowledge_base_function(node.document_uuids)

-        # Set up system message and functions
-        (
-            system_message,
-            functions,
-        ) = await self._compose_system_message_functions_for_node(node)
-        await self._update_llm_context(system_message, functions)
+        # Compose prompt and functions via the context composer module
+        system_prompt = compose_system_prompt_for_node(
+            node=node,
+            workflow=self.workflow,
+            format_prompt=self._format_prompt,
+            has_recordings=self._has_recordings,
+        )
+        functions = await compose_functions_for_node(
+            node=node,
+            builtin_function_schemas=self.builtin_function_schemas,
+            custom_tool_manager=self._custom_tool_manager,
+        )
+        await self._update_llm_context(system_prompt, functions)

    async def set_node(self, node_id: str):
        """
@ -610,62 +625,6 @@ class PipecatEngine:
        )
        await self.task.queue_frame(frame_to_push)

-    async def _compose_system_message_functions_for_node(
-        self, node: "Node"
-    ) -> tuple[list[dict], list[dict]]:
-        """Generate the system messages and function schemas for the given node.
-
-        This performs the same formatting logic used when entering a node but
-        does **not** register the functions with the LLM; callers are
-        responsible for that.
-        """
-
-        global_prompt = ""
-        if self.workflow.global_node_id and node.add_global_prompt:
-            global_node = self.workflow.nodes[self.workflow.global_node_id]
-            global_prompt = self._format_prompt(global_node.prompt)
-
-        functions: list[dict] = []
-
-        # Add built-in function schemas (calculator and timezone tools)
-        functions.extend(self.builtin_function_schemas)
-
-        # Add knowledge base retrieval tool if node has documents
-        if node.document_uuids:
-            kb_tool_def = get_knowledge_base_tool(node.document_uuids)
-            kb_schema = get_function_schema(
-                kb_tool_def["function"]["name"],
-                kb_tool_def["function"]["description"],
-                properties=kb_tool_def["function"]["parameters"].get("properties", {}),
-                required=kb_tool_def["function"]["parameters"].get("required", []),
-            )
-            functions.append(kb_schema)
-
-        # Add custom tools from node.tool_uuids
-        if node.tool_uuids and self._custom_tool_manager:
-            custom_tool_schemas = await self._custom_tool_manager.get_tool_schemas(
-                node.tool_uuids
-            )
-            functions.extend(custom_tool_schemas)
-
-        # Transition functions (schema only; registration handled elsewhere)
-        for outgoing_edge in node.out_edges:
-            function_schema = self._get_function_schema(
-                outgoing_edge.get_function_name(), outgoing_edge.condition
-            )
-            functions.append(function_schema)
-
-        formatted_node_prompt = self._format_prompt(node.prompt)
-
-        system_message = {
-            "role": "system",
-            "content": "\n\n".join(
-                p for p in (global_prompt, formatted_node_prompt) if p
-            ),
-        }
-
-        return system_message, functions
-
    async def should_mute_user(self, frame: "Frame") -> bool:
        """
        Callback for CallbackUserMuteStrategy to determine if the user should be muted.
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -0,0 +1,138 @@
+"""System prompt and function schema composition for PipecatEngine nodes.
+
+Extracts prompt and function composition logic from PipecatEngine into
+reusable functions. Defines recording response mode markers and instructions.
+"""
+
+from typing import TYPE_CHECKING, Callable, Optional
+
+if TYPE_CHECKING:
+    from api.services.workflow.pipecat_engine_custom_tools import CustomToolManager
+    from api.services.workflow.workflow import Node, WorkflowGraph
+
+from api.services.workflow.pipecat_engine_custom_tools import get_function_schema
+from api.services.workflow.tools.knowledge_base import get_knowledge_base_tool
+
+# ---------------------------------------------------------------------------
+# Recording response mode markers
+# ---------------------------------------------------------------------------
+# Single-character prefixes that identify the response mode of an LLM reply.
+# NOTE(review): the same glyphs are also written literally inside
+# RECORDING_RESPONSE_MODE_INSTRUCTIONS below, so the two must be kept in sync
+# by hand if either marker ever changes.
+
+RECORDING_MARKER = "●"  # Play pre-recorded audio
+TTS_MARKER = "▸"  # Generate dynamic TTS text
+
+# ---------------------------------------------------------------------------
+# Recording response mode system prompt instructions
+# ---------------------------------------------------------------------------
+
+RECORDING_RESPONSE_MODE_INSTRUCTIONS = """\
+RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
+Every response you generate MUST begin with a response mode indicator.
+You have two modes for responding:
+
+1. DYNAMIC SPEECH (▸): Generate text that will be converted to speech by TTS.
+   Format: `▸` followed by a space and your full spoken response.
+   Example: ▸ Hello! How can I help you today?
+
+2. PRE-RECORDED AUDIO (●): Play a pre-recorded audio message.
+   Format: `●` followed by a space and ONLY the recording_id. Nothing else.
+   Example: ● rec_greeting_01
+
+RULES:
+- Your response MUST start with either `▸` or `●` as the very first character.
+- For `▸` (dynamic speech): Follow with a space and your full response text.
+- For `●` (pre-recorded audio): Follow with a space and ONLY the recording_id. No other text.
+- Use `●` when a pre-recorded message matches the situation well.
+- Use `▸` when you need to generate a dynamic, contextual response.
+- NEVER mix modes in a single response. Choose one."""
+
+
+def compose_system_prompt_for_node(
+    *,
+    node: "Node",
+    workflow: "WorkflowGraph",
+    format_prompt: Callable[[str], str],
+    has_recordings: bool,
+) -> str:
+    """Compose the full system prompt text for a workflow node.
+
+    Builds the prompt from up to three sections, in this order: the
+    formatted global prompt, the formatted node prompt, and the recording
+    response mode instructions. Empty prompt sections are dropped and the
+    remaining sections are joined with a blank line between them.
+
+    Args:
+        node: The workflow node to compose the prompt for. Its
+            ``add_global_prompt`` flag decides whether the global prompt
+            is included.
+        workflow: The full workflow graph; the global prompt is read from
+            ``workflow.nodes[workflow.global_node_id]`` when that ID is set.
+        format_prompt: Callable to render template variables in prompts;
+            applied to the global prompt and the node prompt separately.
+        has_recordings: When True, ``RECORDING_RESPONSE_MODE_INSTRUCTIONS``
+            is appended as the last section.
+
+    Returns:
+        The composed system prompt text; an empty string if both prompts
+        are empty and ``has_recordings`` is False.
+    """
+    # The global prompt is opt-in per node and only exists when the workflow
+    # declares a global node.
+    global_prompt = ""
+    if workflow.global_node_id and node.add_global_prompt:
+        global_node = workflow.nodes[workflow.global_node_id]
+        global_prompt = format_prompt(global_node.prompt)
+
+    formatted_node_prompt = format_prompt(node.prompt)
+
+    # Drop empty sections so the join below never emits stray blank lines.
+    parts = [p for p in (global_prompt, formatted_node_prompt) if p]
+
+    if has_recordings:
+        # The instructions go last, after the global and node prompts.
+        parts.append(RECORDING_RESPONSE_MODE_INSTRUCTIONS)
+        # TODO: Append per-node available recordings list here once
+        # Node.recording_ids is populated. The list should include
+        # recording_id and a short description so the LLM can choose.
+
+    return "\n\n".join(parts)
+
+
+async def compose_functions_for_node(
+    *,
+    node: "Node",
+    builtin_function_schemas: list[dict],
+    custom_tool_manager: Optional["CustomToolManager"],
+) -> list[dict]:
+    """Compose the function/tool schemas for a workflow node.
+
+    Gathers built-in tools, knowledge-base tools, custom tools,
+    and transition function schemas into a single, newly created list,
+    in that order. The ``builtin_function_schemas`` argument itself is not
+    modified.
+
+    NOTE(review): the knowledge-base and transition entries are built with
+    ``get_function_schema``, which returns ``FunctionSchema`` objects, so the
+    ``list[dict]`` annotations here look stale - confirm and tighten.
+
+    Args:
+        node: The workflow node to compose functions for. Its
+            ``document_uuids``, ``tool_uuids`` and ``out_edges`` decide
+            which optional schemas are added.
+        builtin_function_schemas: Pre-computed schemas for built-in tools.
+        custom_tool_manager: Manager for user-defined custom tools (may be
+            None, in which case custom tools are skipped even if the node
+            lists ``tool_uuids``).
+
+    Returns:
+        A list of function schemas to register with the LLM.
+    """
+    functions: list[dict] = []
+
+    # Built-in tools (calculator, timezone)
+    functions.extend(builtin_function_schemas)
+
+    # Knowledge base retrieval tool: only offered when the node has documents.
+    # The tool definition is re-wrapped via get_function_schema, carrying over
+    # its name, description, parameter properties and required list.
+    if node.document_uuids:
+        kb_tool_def = get_knowledge_base_tool(node.document_uuids)
+        kb_schema = get_function_schema(
+            kb_tool_def["function"]["name"],
+            kb_tool_def["function"]["description"],
+            properties=kb_tool_def["function"]["parameters"].get("properties", {}),
+            required=kb_tool_def["function"]["parameters"].get("required", []),
+        )
+        functions.append(kb_schema)
+
+    # Custom tools: require both tool UUIDs on the node and a manager instance.
+    if node.tool_uuids and custom_tool_manager:
+        custom_tool_schemas = await custom_tool_manager.get_tool_schemas(
+            node.tool_uuids
+        )
+        functions.extend(custom_tool_schemas)
+
+    # Transition function schemas: one parameter-less function per outgoing
+    # edge, using the edge condition as the function description.
+    for outgoing_edge in node.out_edges:
+        function_schema = get_function_schema(
+            outgoing_edge.get_function_name(), outgoing_edge.condition
+        )
+        functions.append(function_schema)
+
+    return functions
--- a/api/services/workflow/pipecat_engine_custom_tools.py
+++ b/api/services/workflow/pipecat_engine_custom_tools.py
@ -10,7 +10,7 @@ import asyncio
 import re
 import time
 import uuid
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 from loguru import logger

@ -23,7 +23,6 @@ from api.services.telephony.transfer_event_protocol import TransferContext
 from api.services.workflow.disposition_mapper import (
    get_organization_id_from_workflow_run,
 )
-from api.services.workflow.pipecat_engine_utils import get_function_schema
 from api.services.workflow.tools.custom_tool import (
    execute_http_tool,
    tool_to_function_schema,
@ -42,6 +41,29 @@ if TYPE_CHECKING:
    from api.services.workflow.pipecat_engine import PipecatEngine


+def get_function_schema(
+    function_name: str,
+    description: str,
+    *,
+    properties: Dict[str, Any] | None = None,
+    required: List[str] | None = None,
+) -> FunctionSchema:
+    """Create a FunctionSchema definition that can later be transformed into
+    the provider-specific format (OpenAI, Gemini, etc.).
+
+    The helper keeps the public signature backward-compatible – callers that
+    only pass ``function_name`` and ``description`` continue to work and will
+    define a parameter-less function.
+
+    Args:
+        function_name: Name given to the resulting schema.
+        description: Description given to the resulting schema.
+        properties: Parameter definitions for the function (keyword-only).
+            ``None`` or an empty mapping is passed on as an empty dict.
+        required: Names of required parameters (keyword-only). ``None`` or
+            an empty list is passed on as an empty list.
+
+    Returns:
+        A ``FunctionSchema`` built from the given values.
+    """
+
+    # `or` normalises None (and any other falsy value) to a fresh empty
+    # container, so FunctionSchema never receives None for these fields.
+    return FunctionSchema(
+        name=function_name,
+        description=description,
+        properties=properties or {},
+        required=required or [],
+    )
+
+
 class CustomToolManager:
    """Manager for custom tool registration and execution.

--- a/api/services/workflow/pipecat_engine_utils.py
+++ b/api/services/workflow/pipecat_engine_utils.py
@ -1,68 +0,0 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List
-
-from api.utils.template_renderer import render_template
-from pipecat.adapters.schemas.function_schema import FunctionSchema
-from pipecat.adapters.schemas.tools_schema import ToolsSchema
-from pipecat.processors.aggregators.llm_context import LLMContext
-
-__all__ = [
-    "get_function_schema",
-    "update_llm_context",
-    "render_template",
-]
-
-
-def get_function_schema(
-    function_name: str,
-    description: str,
-    *,
-    properties: Dict[str, Any] | None = None,
-    required: List[str] | None = None,
-) -> FunctionSchema:
-    """Create a FunctionSchema definition that can later be transformed into
-    the provider-specific format (OpenAI, Gemini, etc.).
-
-    The helper keeps the public signature backward-compatible – callers that
-    only pass ``function_name`` and ``description`` continue to work and will
-    define a parameter-less function.
-    """
-
-    return FunctionSchema(
-        name=function_name,
-        description=description,
-        properties=properties or {},
-        required=required or [],
-    )
-
-
-def update_llm_context(
-    context: LLMContext,
-    system_message: Dict[str, Any],
-    functions: List[FunctionSchema],
-) -> None:
-    """Update *context* with an up-to-date system message and tool list.
-
-    This helper removes any previous system messages before inserting the new
-    *system_message* at the top of the conversation history and then instructs
-    the LLM which *functions* (a.k.a. tools) are currently available.
-    """
-
-    # Wrap the provided function schemas in a ToolsSchema so that the adapter
-    # associated with the current LLM service can convert them to the correct
-    # provider-specific representation when required.
-    tools_schema = ToolsSchema(standard_tools=functions)
-    previous_interactions = context.messages
-
-    # Replace the first message if it's a system message, otherwise prepend.
-    # Keep any system messages that appear in the middle of the conversation.
-    if previous_interactions and previous_interactions[0]["role"] == "system":
-        messages = [system_message] + previous_interactions[1:]
-    else:
-        messages = [system_message] + previous_interactions
-
-    context.set_messages(messages)
-
-    if functions:
-        context.set_tools(tools_schema)