# dograh/api/services/workflow/qa/node_summary.py

"""Node summary generation and caching for per-node QA analysis."""

from typing import Any

from loguru import logger

from api.db import db_client
from api.db.models import WorkflowRunModel
from api.services.pipecat.service_factory import create_llm_service_from_provider
from api.services.workflow.dto import NodeType, QANodeData
from api.services.workflow.qa.llm_config import resolve_llm_config
from api.services.workflow.qa.tracing import create_node_summary_trace
from pipecat.processors.aggregators.llm_context import LLMContext

# System instruction passed to the LLM when summarizing a single workflow
# node's script (see ensure_node_summaries). One fragment per clause; the
# fragments are joined by implicit string concatenation.
NODE_SUMMARY_SYSTEM_PROMPT = (
    "You are analyzing a voice AI agent script. "
    "This is only a part of a larger script. "
    "Produce a concise summary (2-4 sentences) describing this script purpose, "
    "what the agent should accomplish, and key behaviors. "
    "We will be using this summary to do a QA on the conversation "
    "that the agent would do with someone "
    "so try to capture the nuances of the script as much as possible."
)

# System instruction for summarizing a slice of a conversation transcript.
# One fragment per clause; the fragments are joined by implicit string
# concatenation.
CONVERSATION_SUMMARY_SYSTEM_PROMPT = (
    "You are summarizing a portion of a voice AI conversation. "
    "Produce a concise summary (3-5 sentences) covering key topics, "
    "information exchanged, and current state. "
    "We would be using this summary in doing a QA of the conversation "
    "that the voice AI agent did with someone "
    "so try to capture the nuances of the conversation as much as possible."
)


def get_node_summary_text(node_summaries: dict, node_id: str) -> str:
    """Extract the summary text from a node_summaries entry.

    Handles both the current format (dict with a "summary" key) and the
    legacy format (plain string) for backward compatibility.

    Args:
        node_summaries: Mapping of node id to summary entry. ``None`` or an
            empty mapping is treated as "no summaries".
        node_id: Id of the node whose summary is wanted.

    Returns:
        The summary text, or "" when the node has no entry, the entry has an
        unexpected shape, or the stored summary is missing / not a string.
    """
    if not node_summaries:
        return ""

    entry = node_summaries.get(node_id)

    # Legacy format: the entry is the summary itself.
    if isinstance(entry, str):
        return entry

    # Missing entry (None) or an unexpected shape: nothing usable.
    if not isinstance(entry, dict):
        return ""

    # Guard against an explicit null (or other non-string) stored under
    # "summary" so the declared ``str`` return type always holds.
    summary = entry.get("summary")
    return summary if isinstance(summary, str) else ""


def _collect_tool_uuids(nodes: list[dict]) -> set[str]:
    """Return the union of ``data.tool_uuids`` across the given nodes."""
    return {
        tool_uuid
        for node in nodes
        for tool_uuid in ((node.get("data") or {}).get("tool_uuids") or [])
    }


async def _fetch_tool_map(
    tool_uuids: set[str], workflow_run: WorkflowRunModel
) -> dict[str, dict[str, str]]:
    """Fetch name/description for the given tool UUIDs in a single query.

    Best-effort: returns an empty map when there is nothing to fetch, when
    the run has no workflow/organization, or when the lookup fails.
    """
    if not tool_uuids:
        return {}

    organization_id = (
        workflow_run.workflow.organization_id if workflow_run.workflow else None
    )
    if not organization_id:
        return {}

    try:
        tools = await db_client.get_tools_by_uuids(list(tool_uuids), organization_id)
        return {
            t.tool_uuid: {"name": t.name, "description": t.description or ""}
            for t in tools
        }
    except Exception as e:
        logger.warning(f"Failed to fetch tools for node summaries: {e}")
        return {}


def _group_edges_by_source(edges: list[dict]) -> dict[str, list[dict]]:
    """Group edges by their ``source`` node id, skipping edges without one."""
    grouped: dict[str, list[dict]] = {}
    for edge in edges:
        source = edge.get("source")
        if source:
            grouped.setdefault(source, []).append(edge)
    return grouped


def _build_node_info(
    node_name: str,
    node_data: dict,
    outgoing_edges: list[dict],
    tool_map: dict[str, dict[str, str]],
) -> str:
    """Render the text description of one node that is sent to the LLM."""
    parts = [f"Node name: {node_name}"]
    if node_data.get("prompt"):
        parts.append(f"Agent prompt:\n{node_data['prompt']}")

    # Available tools = custom tools attached to the node + outgoing edges
    # (edges are presented to the LLM as tools as well).
    tool_descriptions: list[str] = []

    for tool_uuid in node_data.get("tool_uuids") or []:
        tool_info = tool_map.get(tool_uuid)
        if not tool_info:
            continue
        desc = f"- {tool_info['name']}"
        if tool_info["description"]:
            desc += f": {tool_info['description']}"
        tool_descriptions.append(desc)

    for edge in outgoing_edges:
        edge_data = edge.get("data") or {}
        label = edge_data.get("label", "")
        if not label:
            continue
        condition = edge_data.get("condition", "")
        tool_descriptions.append(
            f"- {label}: {condition}" if condition else f"- {label}"
        )

    if tool_descriptions:
        parts.append("Available tools:\n" + "\n".join(tool_descriptions))
    return "\n".join(parts)


async def ensure_node_summaries(
    workflow_definition: dict,
    definition_id: int | None,
    workflow_run: WorkflowRunModel,
    qa_data: QANodeData,
) -> dict[str, Any]:
    """Ensure every agent/start node has a summary in the definition.

    Nodes of type ``NodeType.agentNode`` / ``NodeType.startNode`` that do not
    yet have an entry in ``workflow_definition["node_summaries"]`` are
    summarized with the LLM resolved from ``qa_data`` / ``workflow_run``.

    Args:
        workflow_definition: Definition dict; the keys read here are
            "node_summaries", "nodes" and "edges". Keys that are present but
            null are treated as empty.
        definition_id: Id used to persist the summaries; when falsy nothing
            is written to the DB.
        workflow_run: Run used to resolve the LLM config and the
            organization for tool lookups.
        qa_data: QA node configuration used to resolve the LLM config.

    Returns:
        The node_summaries dict:
            {node_id: {"summary": "...", "trace_url": "..."}, ...}
        Nodes whose generation failed get ``{"summary": ""}`` in the returned
        dict, but that placeholder is NOT persisted, so a later call retries
        them instead of caching the failure forever.
    """
    # ``.get(key, default)`` still yields None when the key is present with a
    # null value, so normalize explicitly.
    existing_summaries: dict[str, Any] | None = workflow_definition.get(
        "node_summaries"
    )
    if existing_summaries is None:
        existing_summaries = {}

    nodes = workflow_definition.get("nodes") or []
    summarizable_types = {NodeType.agentNode.value, NodeType.startNode.value}
    nodes_needing_summary = [
        n
        for n in nodes
        if n.get("type") in summarizable_types
        # A node without an id cannot be keyed in the summaries dict.
        and n.get("id") is not None
        and n.get("id") not in existing_summaries
    ]

    if not nodes_needing_summary:
        return existing_summaries

    provider, model, api_key, service_kwargs = await resolve_llm_config(
        qa_data, workflow_run
    )
    if not api_key:
        logger.warning("No API key for node summary generation, skipping")
        return existing_summaries

    llm = create_llm_service_from_provider(provider, model, api_key, **service_kwargs)

    updated_summaries = dict(existing_summaries)
    # Node ids whose LLM call raised; they get a placeholder in the return
    # value but are excluded from what is written to the DB.
    failed_node_ids: set[str] = set()

    # Collect all tool UUIDs across nodes and fetch them in one query
    tool_map = await _fetch_tool_map(
        _collect_tool_uuids(nodes_needing_summary), workflow_run
    )

    # Build a map of outgoing edges per node (edges are also tool calls)
    outgoing_edges_by_node = _group_edges_by_source(
        workflow_definition.get("edges") or []
    )

    for node in nodes_needing_summary:
        node_id = node["id"]
        node_data = node.get("data") or {}
        node_name = node_data.get("name", "Unnamed")

        node_info = _build_node_info(
            node_name,
            node_data,
            outgoing_edges_by_node.get(node_id, []),
            tool_map,
        )
        messages = [
            {"role": "user", "content": node_info},
        ]

        try:
            context = LLMContext()
            context.set_messages(messages)
            summary_text = (
                await llm.run_inference(
                    context, system_instruction=NODE_SUMMARY_SYSTEM_PROMPT
                )
                or ""
            )
        except Exception as e:
            logger.warning(f"Failed to generate summary for node {node_id}: {e}")
            updated_summaries[node_id] = {"summary": ""}
            failed_node_ids.add(node_id)
            continue

        # Create a Langfuse trace for this summary generation
        trace_url = create_node_summary_trace(
            model, messages, summary_text, node_name, NODE_SUMMARY_SYSTEM_PROMPT
        )

        entry: dict[str, Any] = {"summary": summary_text}
        if trace_url:
            entry["trace_url"] = trace_url
        updated_summaries[node_id] = entry

    # Persist to DB, leaving out failure placeholders so those nodes are
    # still "missing" on the next call and get retried.
    persistable = {
        key: value
        for key, value in updated_summaries.items()
        if key not in failed_node_ids
    }
    if definition_id and persistable != existing_summaries:
        try:
            await db_client.update_definition_node_summaries(
                definition_id, persistable
            )
        except Exception as e:
            logger.warning(f"Failed to persist node summaries: {e}")

    return updated_summaries