rowboat/apps/agents/src/graph/guardrails.py
2025-02-05 10:10:04 +05:30

218 lines
8.6 KiB
Python

# Guardrails
from src.utils.common import generate_llm_output
import os
import copy
from src.swarm.types import Response, Agent
from src.utils.common import common_logger, generate_openai_output, update_tokens_used
# Module-level alias for the project's shared logger (imported from
# src.utils.common); post_process_response uses it for debug/info output.
logger = common_logger
def classify_hallucination(context: str, assistant_response: str, chat_history: list, model: str) -> str:
    """
    Ask an LLM whether an assistant response is fully supported by the given context.

    The function builds a single system prompt containing the context, the
    flattened chat history and the response under test, sends it through
    ``generate_llm_output`` and returns that call's result unchanged.

    Args:
        context (str): The context/knowledge base to check the response against.
        assistant_response (str): The response from the assistant to validate.
        chat_history (list): Previous chat messages, each a dict with a ``role``
            key and (optionally) a ``content`` key. ``None`` or an empty list
            yields an empty history section.
        model (str): Model identifier forwarded to ``generate_llm_output``.

    Returns:
        str: The raw output of ``generate_llm_output``. The prompt asks the
        model to answer in the form ``verdict: <class>`` where ``<class>`` is
        one of:
            'yes-absolute' - completely supported by context
            'yes-common-sensical' - supported with common sense interpretation
            'no-absolute' - not supported by context
            'no-subtle' - not supported but difference is subtle
        No parsing or validation of the model output is done here.
    """
    # Messages without a 'content' key (e.g. tool-call messages) are rendered as
    # 'None' instead of raising KeyError, matching post_process_response.
    chat_history_str = "\n".join(
        f"{message['role']}: {message.get('content', 'None')}"
        for message in (chat_history or [])
    )

    # The prompt text is kept flush-left so that no indentation leaks into the
    # string that is sent to the model.
    prompt = f"""
You are a guardrail agent. Your job is to check if the response is hallucinating.
------------------------------------------------------------------------
Here is the context:
{context}
------------------------------------------------------------------------
Here is the chat history message:
{chat_history_str}
------------------------------------------------------------------------
Here is the response:
{assistant_response}
------------------------------------------------------------------------
As a hallucination guardrail, your job is to go through each line of the response and check if it is completely supported by the context. Even if a single line is not supported, the response is no.
Output a single verdict for the entire response. don't provide any reasoning. The output classes are
yes-absolute: completely supported by the context
yes-common-sensical: but with some common sense interpretation
no-absolute: not supported by the context
no-subtle: not supported by the context but the difference is subtle
Output one of the classes:
verdict : yes-absolute/yes-common-sensical/no-absolute/no-subtle
Example 1: The response is completely supported by the context.
User Input:
Context: "Our airline provides complimentary meals and beverages on all international flights. Passengers are allowed one carry-on bag and one personal item."
Chat History:
User: "Do international flights with your airline offer free meals?"
Response: "Yes, all international flights with our airline offer free meals and beverages."
Output: verdict: yes-absolute
Example 2: The response is generally true and could be deduced with common sense interpretation, though not explicitly stated in the context.
User Input:
Context: "Flights may experience delays due to weather conditions. In such cases, the airline staff will provide updates at the airport."
Chat History:
User: "Will there be announcements if my flight is delayed?"
Response: "Yes, if your flight is delayed, there will be announcements at the airport."
Output: verdict: yes-common-sensical
Example 3: The response is not supported by the context and contains glaring inaccuracies.
User Input:
Context: "You can cancel your ticket online up to 24 hours before the flight's departure time and receive a full refund."
Chat History:
User: "Can I get a refund if I cancel 12 hours before the flight?"
Response: "Yes, you can get a refund if you cancel 12 hours before the flight."
Output: verdict: no-absolute
Example 4: The response is not supported by the context but the difference is subtle.
User Input:
Context: "Our frequent flyer program offers discounts on checked bags for members who have achieved Gold status."
Chat History:
User: "As a member, do I get discounts on checked bags?"
Response: "Yes, members of our frequent flyer program get discounts on checked bags."
Output: verdict: no-subtle
"""

    # The whole task is delivered as one system message; there is no user turn.
    messages = [
        {
            "role": "system",
            "content": prompt,
        },
    ]
    return generate_llm_output(messages, model)
def _build_post_process_prompt(post_process_instructions, agent_history_str, agent_instructions, agent_response, context=None, style_prompt=None):
    """Assemble the system prompt for the post-processing LLM call.

    Sections are appended in a fixed order: role + admin instructions + chat
    history, then the optional context, then the optional style prompt, and
    finally the agent's own instructions and the response to rewrite.
    The text is kept flush-left so no indentation leaks into the prompt.
    """
    prompt = f"""
# ROLE
You are a post processing agent responsible for rewriting a response generated by an agent, according to instructions provided below. Ensure that the response you produce adheres to the instructions provided to you (if any).
------------------------------------------------------------------------
# ADDITIONAL INSTRUCTIONS
Here are additional instructions that the admin might have configured for you:
{post_process_instructions}
------------------------------------------------------------------------
# CHAT HISTORY
Here is the chat history:
{agent_history_str}
"""
    if context:
        prompt += f"""
------------------------------------------------------------------------
# CONTEXT
Here is the context:
{context}
"""
    if style_prompt:
        prompt += f"""
------------------------------------------------------------------------
# STYLE PROMPT
Here is the style prompt:
{style_prompt}
"""
    prompt += f"""
------------------------------------------------------------------------
# AGENT INSTRUCTIONS
Here are the instructions to the agent generating the response:
{agent_instructions}
------------------------------------------------------------------------
# AGENT RESPONSE
Here is the response that the agent has generated:
{agent_response}
"""
    return prompt


def post_process_response(messages: list, post_processing_agent_name: str, post_process_instructions: str, style_prompt: str = None, context: str = None, model: str = "gpt-4o", tokens_used: dict = None, last_agent: Agent = None) -> Response:
    """
    Rewrite the last (internal) agent message with an LLM so it can be shown externally.

    The last entry of ``messages`` is deep-copied, so the caller's list is never
    mutated. Post-processing is skipped, and a ``Response`` with an empty
    ``messages`` list is returned, when any of the following holds:
        - ``messages`` is empty
        - the last message carries tool calls
        - the last message's ``response_type`` is not ``"internal"``
        - the last message has no content
        - ``post_process_instructions`` is empty

    Otherwise a prompt is built from the admin instructions, the agent's
    history (all entries except the last), the optional context and style
    prompt, the agent's instructions and the pending content, and sent through
    ``generate_openai_output``. The rewritten copy is returned with
    ``response_type`` set to ``"external"`` and the post-processing agent name
    appended to ``sender``.

    Args:
        messages (list): Conversation messages (dicts); only the last one is processed.
        post_processing_agent_name (str): Name appended to the message's ``sender``.
        post_process_instructions (str): Admin-configured rewriting instructions.
        style_prompt (str, optional): Extra style guidance added to the prompt.
        context (str, optional): Extra context added to the prompt.
        model (str): Model identifier passed to ``generate_openai_output``.
        tokens_used (dict, optional): Running token accounting; a fresh dict is
            used when omitted. It is passed to ``update_tokens_used`` and the
            value that function returns is placed on the response.
        last_agent (Agent, optional): Agent that produced the message; its
            ``instructions`` and ``history`` feed the prompt. When ``None``,
            empty instructions and history are used.

    Returns:
        Response: ``messages`` holds the single rewritten message, or is empty
        when post-processing was skipped; ``error_msg`` is always ``''``.
    """
    # A literal `{}` default would be one dict shared by every call.
    if tokens_used is None:
        tokens_used = {}

    skip = False
    pending_msg = None
    if not messages:
        logger.info("No messages to post process, skipping post processing")
        skip = True
    else:
        pending_msg = copy.deepcopy(messages[-1])
        logger.debug(f"Pending message keys: {pending_msg.keys()}")
        if pending_msg.get("tool_calls"):
            logger.info("Last message is a tool call, skipping post processing and setting last message to external")
            skip = True
        elif pending_msg.get('response_type') != "internal":
            # A message without a response_type is treated as not internal.
            logger.info("Last message is not internal, skipping post processing and setting last message to external")
            skip = True
        elif not pending_msg.get('content'):
            logger.info("Last message has no content, skipping post processing and setting last message to external")
            skip = True
        elif not post_process_instructions:
            logger.info("No post process instructions, skipping post processing and setting last message to external")
            skip = True

    if skip:
        # NOTE(review): no message is returned on this path, so any "external"
        # marking of the untouched last message is presumably left to the
        # caller - confirm against the call site.
        return Response(
            messages=[],
            tokens_used=tokens_used,
            agent=last_agent,
            error_msg=''
        )

    # last_agent defaults to None in the signature; fall back to empty values
    # instead of raising AttributeError.
    agent_instructions = last_agent.instructions if last_agent is not None else ''
    agent_history = last_agent.history if last_agent is not None else []

    # The final history entry is left out of the transcript; the response to
    # rewrite is supplied separately in the AGENT RESPONSE section.
    separator = f"\n{'*'*100}\n"
    agent_history_str = separator.join(
        f"Role: {message['role']} | Content: {message.get('content', 'None')} | Tool Calls: {message.get('tool_calls', 'None')}"
        for message in agent_history[:-1]
    )
    logger.debug(f"Agent history: {agent_history_str}")

    prompt = _build_post_process_prompt(
        post_process_instructions=post_process_instructions,
        agent_history_str=agent_history_str,
        agent_instructions=agent_instructions,
        agent_response=pending_msg['content'],
        context=context,
        style_prompt=style_prompt,
    )

    logger.debug(f"Sanitizing response for style. Original response: {pending_msg['content']}")
    completion = generate_openai_output(
        messages=[
            {"role": "system", "content": prompt}
        ],
        model=model,
        return_completion=True
    )
    content = completion.choices[0].message.content
    if content:
        content = content.strip()

    tokens_used = update_tokens_used(provider="openai", model=model, tokens_used=tokens_used, completion=completion)
    logger.debug(f"Response after style check: {content}, tokens used: {tokens_used}")

    # Keep the agent's original text if the model returned nothing usable.
    if content:
        pending_msg['content'] = content
    pending_msg['response_type'] = "external"
    # A missing or None sender is treated as an empty string.
    sender = pending_msg.get('sender') or ''
    pending_msg['sender'] = sender + f' >> {post_processing_agent_name}'

    return Response(
        messages=[pending_msg],
        tokens_used=tokens_used,
        agent=last_agent,
        error_msg=''
    )