# 2025-01-13 18:20:38 +05:30
# Guardrails
from src . utils . common import generate_llm_output
import os
import copy
from src . swarm . types import Response , Agent
from src . utils . common import common_logger , generate_openai_output , update_tokens_used
logger = common_logger
def classify_hallucination(context: str, assistant_response: str, chat_history: list, model: str) -> str:
    """Check whether an assistant response is hallucinated relative to the given context.

    Builds a single system prompt containing the context, the chat history and the
    response, then asks the LLM for a one-word verdict with no reasoning.

    Args:
        context (str): The context / knowledge base to check the response against.
        assistant_response (str): The response from the assistant to validate.
        chat_history (list): Previous chat messages as dicts with 'role' and 'content' keys.
        model (str): Name of the model used to run the classification.

    Returns:
        str: Verdict indicating the level of hallucination:
            'yes-absolute'        - completely supported by context
            'yes-common-sensical' - supported with common sense interpretation
            'no-absolute'         - not supported by context
            'no-subtle'           - not supported but the difference is subtle
    """
    # Flatten the history into "role: content" lines for inclusion in the prompt.
    chat_history_str = "\n".join(
        f"{message['role']}: {message['content']}" for message in chat_history
    )
    # NOTE: fixed typo in the instruction line ("Output of of" -> "Output one of").
    prompt = f"""
You are a guardrail agent. Your job is to check if the response is hallucinating.
----------------------------------------------------------------------------
Here is the context:
{context}
----------------------------------------------------------------------------
Here is the chat history message:
{chat_history_str}
----------------------------------------------------------------------------
Here is the response:
{assistant_response}
----------------------------------------------------------------------------
As a hallucination guardrail, your job is to go through each line of the response and check if it is completely supported by the context. Even if a single line is not supported, the response is no.
Output a single verdict for the entire response. Don't provide any reasoning. The output classes are
yes-absolute: completely supported by the context
yes-common-sensical: but with some common sense interpretation
no-absolute: not supported by the context
no-subtle: not supported by the context but the difference is subtle
Output one of the classes:
verdict: yes-absolute/yes-common-sensical/no-absolute/no-subtle
Example 1: The response is completely supported by the context.
User Input:
Context: "Our airline provides complimentary meals and beverages on all international flights. Passengers are allowed one carry-on bag and one personal item."
Chat History:
User: "Do international flights with your airline offer free meals?"
Response: "Yes, all international flights with our airline offer free meals and beverages."
Output: verdict: yes-absolute
Example 2: The response is generally true and could be deduced with common sense interpretation, though not explicitly stated in the context.
User Input:
Context: "Flights may experience delays due to weather conditions. In such cases, the airline staff will provide updates at the airport."
Chat History:
User: "Will there be announcements if my flight is delayed?"
Response: "Yes, if your flight is delayed, there will be announcements at the airport."
Output: verdict: yes-common-sensical
Example 3: The response is not supported by the context and contains glaring inaccuracies.
User Input:
Context: "You can cancel your ticket online up to 24 hours before the flight's departure time and receive a full refund."
Chat History:
User: "Can I get a refund if I cancel 12 hours before the flight?"
Response: "Yes, you can get a refund if you cancel 12 hours before the flight."
Output: verdict: no-absolute
Example 4: The response is not supported by the context but the difference is subtle.
User Input:
Context: "Our frequent flyer program offers discounts on checked bags for members who have achieved Gold status."
Chat History:
User: "As a member, do I get discounts on checked bags?"
Response: "Yes, members of our frequent flyer program get discounts on checked bags."
Output: verdict: no-subtle
"""
    messages = [
        {
            "role": "system",
            "content": prompt,
        },
    ]
    # Delegate the actual classification to the shared LLM helper.
    response = generate_llm_output(messages, model)
    return response
def post_process_response(messages: list, post_processing_agent_name: str, post_process_instructions: str, style_prompt: str = None, context: str = None, model: str = "gpt-4o", tokens_used: dict = None, last_agent: Agent = None) -> Response:
    """Rewrite the last internal message according to admin-configured instructions.

    If the last message is a tool call, is already external, has no content, or
    there are no post-process instructions, post processing is skipped and an
    empty Response is returned. Otherwise the message content is rewritten by a
    post-processing LLM call and returned as a single external message.

    Args:
        messages (list): Conversation messages; only the last one is considered.
        post_processing_agent_name (str): Name appended to the message's 'sender' chain.
        post_process_instructions (str): Admin-configured rewrite instructions.
        style_prompt (str, optional): Extra style guidance added to the prompt.
        context (str, optional): Extra context added to the prompt.
        model (str): Model name used for the rewrite call.
        tokens_used (dict, optional): Running token-usage accumulator. Defaults to
            a fresh dict per call (fixed: previously a shared mutable default).
        last_agent (Agent): Agent that produced the message; provides instructions
            and history for the prompt and is echoed back in the Response.

    Returns:
        Response: Post-processed message (or empty list of messages when skipped),
        along with updated token usage and the last agent.
    """
    # Fix: avoid mutable default argument -- a shared dict would leak token
    # counts across unrelated calls.
    tokens_used = {} if tokens_used is None else tokens_used
    agent_instructions = last_agent.instructions
    agent_history = last_agent.history
    # Work on a copy so the caller's message list is not mutated.
    pending_msg = copy.deepcopy(messages[-1])
    logger.debug(f"Pending message keys: {pending_msg.keys()}")

    # Decide whether post processing applies to this message at all.
    skip = False
    if pending_msg.get("tool_calls"):
        logger.info("Last message is a tool call, skipping post processing and setting last message to external")
        skip = True
    elif not pending_msg.get("response_type") == "internal":
        # .get() instead of [] so a message without 'response_type' is skipped
        # rather than raising KeyError.
        logger.info("Last message is not internal, skipping post processing and setting last message to external")
        skip = True
    elif not pending_msg.get("content"):
        logger.info("Last message has no content, skipping post processing and setting last message to external")
        skip = True
    elif not post_process_instructions:
        logger.info("No post process instructions, skipping post processing and setting last message to external")
        skip = True
    if skip:
        # NOTE(review): the external flag is set on the deep copy only and the
        # Response carries no messages, so this flip never reaches messages[-1]
        # -- confirm callers expect that.
        pending_msg['response_type'] = "external"
        return Response(
            messages=[],
            tokens_used=tokens_used,
            agent=last_agent,
            error_msg=''
        )

    # Flatten the agent history (excluding the pending message) for the prompt.
    agent_history_str = f"\n{'*' * 100}\n".join(
        f"Role: {message['role']} | Content: {message.get('content', 'None')} | Tool Calls: {message.get('tool_calls', 'None')}"
        for message in agent_history[:-1]
    )
    logger.debug(f"Agent history: {agent_history_str}")
    # Fix: removed stray VCS timestamps that had leaked into this runtime prompt.
    prompt = f"""
# ROLE
You are a post processing agent responsible for rewriting a response generated by an agent, according to instructions provided below. Ensure that the response you produce adheres to the instructions provided to you (if any).
----------------------------------------------------------------------------
# ADDITIONAL INSTRUCTIONS
Here are additional instructions that the admin might have configured for you:
{post_process_instructions}
----------------------------------------------------------------------------
# CHAT HISTORY
Here is the chat history:
{agent_history_str}
"""
    if context:
        prompt += f"""
----------------------------------------------------------------------------
# CONTEXT
Here is the context:
{context}
"""
    if style_prompt:
        prompt += f"""
----------------------------------------------------------------------------
# STYLE PROMPT
Here is the style prompt:
{style_prompt}
"""
    prompt += f"""
----------------------------------------------------------------------------
# AGENT INSTRUCTIONS
Here are the instructions to the agent generating the response:
{agent_instructions}
----------------------------------------------------------------------------
# AGENT RESPONSE
Here is the response that the agent has generated:
{pending_msg['content']}
"""
    logger.debug(f"Sanitizing response for style. Original response: {pending_msg['content']}")
    completion = generate_openai_output(
        messages=[
            {"role": "system", "content": prompt}
        ],
        model=model,
        return_completion=True
    )
    content = completion.choices[0].message.content
    if content:
        # strip() already trims both ends; the old lstrip()/rstrip() chain was redundant.
        content = content.strip()
    tokens_used = update_tokens_used(provider="openai", model=model, tokens_used=tokens_used, completion=completion)
    logger.debug(f"Response after style check: {content}, tokens used: {tokens_used}")
    # Fall back to the original content if the model returned nothing.
    pending_msg['content'] = content if content else pending_msg['content']
    pending_msg['response_type'] = "external"
    pending_msg['sender'] = pending_msg['sender'] + f' >> {post_processing_agent_name}'
    return Response(
        messages=[pending_msg],
        tokens_used=tokens_used,
        agent=last_agent,
        error_msg=''
    )