feat: add hybrid text + recording functionality in agents (#191)

* feat: add recording feature in agents * chore: pin pipecat version * feat: show usage in UI * chore: update pipecat
2026-07-01 08:59:46 +02:00 · 2026-03-16 15:04:08 +05:30 · 2026-03-16 15:04:08 +05:30 · 494c60d774
commit 494c60d774
parent f075bcb623
43 changed files with 2865 additions and 397 deletions
--- a/api/services/workflow/pipecat_engine_context_composer.py
+++ b/api/services/workflow/pipecat_engine_context_composer.py
@ -0,0 +1,138 @@
+"""System prompt and function schema composition for PipecatEngine nodes.
+
+Extracts prompt and function composition logic from PipecatEngine into
+reusable functions. Defines recording response mode markers and instructions.
+"""
+
+from typing import TYPE_CHECKING, Callable, Optional
+
+if TYPE_CHECKING:
+    from api.services.workflow.pipecat_engine_custom_tools import CustomToolManager
+    from api.services.workflow.workflow import Node, WorkflowGraph
+
+from api.services.workflow.pipecat_engine_custom_tools import get_function_schema
+from api.services.workflow.tools.knowledge_base import get_knowledge_base_tool
+
+# ---------------------------------------------------------------------------
+# Recording response mode markers
+# ---------------------------------------------------------------------------
+
+# Each marker is the single leading character that the system prompt below
+# instructs the LLM to emit in order to select a response mode.
+RECORDING_MARKER = "●"  # Play pre-recorded audio
+TTS_MARKER = "▸"  # Generate dynamic TTS text
+
+# ---------------------------------------------------------------------------
+# Recording response mode system prompt instructions
+# ---------------------------------------------------------------------------
+
+# The marker glyphs are interpolated from the constants above instead of being
+# repeated literally, so the prompt text cannot drift from the marker values.
+# NOTE: this is an f-string; any literal brace added to the text must be doubled.
+RECORDING_RESPONSE_MODE_INSTRUCTIONS = f"""\
+RESPONSE MODE INSTRUCTIONS - MANDATORY FORMAT:
+Every response you generate MUST begin with a response mode indicator.
+You have two modes for responding:
+
+1. DYNAMIC SPEECH ({TTS_MARKER}): Generate text that will be converted to speech by TTS.
+   Format: `{TTS_MARKER}` followed by a space and your full spoken response.
+   Example: {TTS_MARKER} Hello! How can I help you today?
+
+2. PRE-RECORDED AUDIO ({RECORDING_MARKER}): Play a pre-recorded audio message.
+   Format: `{RECORDING_MARKER}` followed by a space and ONLY the recording_id. Nothing else.
+   Example: {RECORDING_MARKER} rec_greeting_01
+
+RULES:
+- Your response MUST start with either `{TTS_MARKER}` or `{RECORDING_MARKER}` as the very first character.
+- For `{TTS_MARKER}` (dynamic speech): Follow with a space and your full response text.
+- For `{RECORDING_MARKER}` (pre-recorded audio): Follow with a space and ONLY the recording_id. No other text.
+- Use `{RECORDING_MARKER}` when a pre-recorded message matches the situation well.
+- Use `{TTS_MARKER}` when you need to generate a dynamic, contextual response.
+- NEVER mix modes in a single response. Choose one."""
+
+
+def compose_system_prompt_for_node(
+    *,
+    node: "Node",
+    workflow: "WorkflowGraph",
+    format_prompt: Callable[[str], str],
+    has_recordings: bool,
+) -> str:
+    """Build the system prompt text for a single workflow node.
+
+    The result is assembled from up to three sections, in this order and
+    separated by a blank line:
+
+    1. the rendered global prompt (only when the workflow has a global node
+       and this node opts in via ``add_global_prompt``),
+    2. the rendered node prompt,
+    3. the recording response mode instructions (only when
+       ``has_recordings`` is true).
+
+    Rendered prompts that come back empty are left out entirely.
+
+    Args:
+        node: The workflow node whose prompt is being composed.
+        workflow: The workflow graph; supplies the global node, if any.
+        format_prompt: Callable applied to each raw prompt before it is used.
+        has_recordings: Whether to append the recording response mode
+            instructions.
+
+    Returns:
+        The sections joined with a blank line between them.
+    """
+    sections: list[str] = []
+
+    # The global prompt is rendered first so it precedes the node prompt.
+    if workflow.global_node_id and node.add_global_prompt:
+        shared_prompt = format_prompt(workflow.nodes[workflow.global_node_id].prompt)
+        if shared_prompt:
+            sections.append(shared_prompt)
+
+    own_prompt = format_prompt(node.prompt)
+    if own_prompt:
+        sections.append(own_prompt)
+
+    if has_recordings:
+        # TODO: Append per-node available recordings list here once
+        # Node.recording_ids is populated. The list should include
+        # recording_id and a short description so the LLM can choose.
+        sections.append(RECORDING_RESPONSE_MODE_INSTRUCTIONS)
+
+    return "\n\n".join(sections)
+
+
+async def compose_functions_for_node(
+    *,
+    node: "Node",
+    builtin_function_schemas: list[dict],
+    custom_tool_manager: Optional["CustomToolManager"],
+) -> list[dict]:
+    """Collect every function/tool schema that applies to a workflow node.
+
+    Schemas are gathered in a fixed order: built-in tools, the knowledge
+    base tool, custom tools, then one transition function per outgoing edge.
+
+    Args:
+        node: The workflow node to collect schemas for.
+        builtin_function_schemas: Already-built schemas for built-in tools;
+            copied into the result, never mutated.
+        custom_tool_manager: Source of user-defined tool schemas. When it is
+            None (or otherwise falsy) custom tools are skipped.
+
+    Returns:
+        A new list of function schemas to register with the LLM.
+    """
+    # Start from a copy of the built-in tools (calculator, timezone).
+    schemas: list[dict] = list(builtin_function_schemas)
+
+    # Knowledge base retrieval tool, only for nodes with attached documents.
+    if node.document_uuids:
+        kb_function = get_knowledge_base_tool(node.document_uuids)["function"]
+        schemas.append(
+            get_function_schema(
+                kb_function["name"],
+                kb_function["description"],
+                properties=kb_function["parameters"].get("properties", {}),
+                required=kb_function["parameters"].get("required", []),
+            )
+        )
+
+    # Custom tools need both tool ids on the node and a manager to resolve them.
+    if node.tool_uuids and custom_tool_manager:
+        schemas.extend(await custom_tool_manager.get_tool_schemas(node.tool_uuids))
+
+    # One transition function per outgoing edge, described by the edge condition.
+    schemas.extend(
+        get_function_schema(edge.get_function_name(), edge.condition)
+        for edge in node.out_edges
+    )
+
+    return schemas