# Guardrails
import copy
import os
from typing import Optional

from src.utils.common import generate_llm_output
from src.swarm.types import Response, Agent
from src.utils.common import common_logger, generate_openai_output, update_tokens_used

logger = common_logger


def classify_hallucination(context: str, assistant_response: str, chat_history: list, model: str) -> str:
    """
    Ask an LLM whether an assistant's response is supported by the provided context.

    Builds a single system prompt containing the context, the chat history and
    the response, and sends it through ``generate_llm_output``.

    Args:
        context (str): The context/knowledge base to check the response against.
        assistant_response (str): The response from the assistant to validate.
        chat_history (list): Previous chat messages; each item is indexed with
            ``'role'`` and ``'content'``. ``None`` or an empty list is treated
            as "no history".
        model (str): Model identifier forwarded to ``generate_llm_output``.

    Returns:
        str: Whatever ``generate_llm_output`` returns for the prompt. The prompt
        asks the model for exactly one verdict:
            'yes-absolute' - completely supported by context
            'yes-common-sensical' - supported with common sense interpretation
            'no-absolute' - not supported by context
            'no-subtle' - not supported but difference is subtle
        The returned text is not parsed or validated here.
    """
    # Tolerate a missing history instead of failing on iteration over None.
    chat_history_str = "\n".join(
        f"{message['role']}: {message['content']}" for message in (chat_history or [])
    )

    prompt = f"""
    You are a guardrail agent. Your job is to check if the response is hallucinating.

    ------------------------------------------------------------------------
    Here is the context:
    {context}

    ------------------------------------------------------------------------
    Here is the chat history message:
    {chat_history_str}

    ------------------------------------------------------------------------
    Here is the response:
    {assistant_response}

    ------------------------------------------------------------------------
    As a hallucination guardrail, your job is to go through each line of the response and check if it is completely supported by the context. Even if a single line is not supported, the response is no.

    Output a single verdict for the entire response. don't provide any reasoning.

    The output classes are
    yes-absolute: completely supported by the context
    yes-common-sensical: but with some common sense interpretation
    no-absolute: not supported by the context
    no-subtle: not supported by the context but the difference is subtle

    Output of of the classes:
    verdict : yes-absolute/yes-common-sensical/no-absolute/no-subtle

    Example 1: The response is completely supported by the context.
    User Input:
        Context: "Our airline provides complimentary meals and beverages on all international flights. Passengers are allowed one carry-on bag and one personal item."
        Chat History:
            User: "Do international flights with your airline offer free meals?"
        Response: "Yes, all international flights with our airline offer free meals and beverages."
    Output: verdict: yes-absolute

    Example 2: The response is generally true and could be deduced with common sense interpretation, though not explicitly stated in the context.
    User Input:
        Context: "Flights may experience delays due to weather conditions. In such cases, the airline staff will provide updates at the airport."
        Chat History:
            User: "Will there be announcements if my flight is delayed?"
        Response: "Yes, if your flight is delayed, there will be announcements at the airport."
    Output: verdict: yes-common-sensical

    Example 3: The response is not supported by the context and contains glaring inaccuracies.
    User Input:
        Context: "You can cancel your ticket online up to 24 hours before the flight's departure time and receive a full refund."
        Chat History:
            User: "Can I get a refund if I cancel 12 hours before the flight?"
        Response: "Yes, you can get a refund if you cancel 12 hours before the flight."
    Output: verdict: no-absolute

    Example 4: The response is not supported by the context but the difference is subtle.
    User Input:
        Context: "Our frequent flyer program offers discounts on checked bags for members who have achieved Gold status."
        Chat History:
            User: "As a member, do I get discounts on checked bags?"
        Response: "Yes, members of our frequent flyer program get discounts on checked bags."
    Output: verdict: no-subtle
    """

    messages = [
        {
            "role": "system",
            "content": prompt,
        },
    ]
    return generate_llm_output(messages, model)


def _post_process_skip_reason(pending_msg: dict, post_process_instructions: Optional[str]) -> Optional[str]:
    """Return the log message explaining why post processing is skipped, or None to proceed."""
    if pending_msg.get("tool_calls"):
        return "Last message is a tool call, skipping post processing and setting last message to external"
    # `.get` so a message without these keys is skipped rather than raising KeyError.
    if pending_msg.get("response_type") != "internal":
        return "Last message is not internal, skipping post processing and setting last message to external"
    if not pending_msg.get("content"):
        return "Last message has no content, skipping post processing and setting last message to external"
    if not post_process_instructions:
        return "No post process instructions, skipping post processing and setting last message to external"
    return None


def _build_post_process_prompt(
    post_process_instructions: str,
    agent_history_str: str,
    context: Optional[str],
    style_prompt: Optional[str],
    agent_instructions: str,
    agent_response: str,
) -> str:
    """Assemble the system prompt for the rewrite; context and style sections are included only when set."""
    head = f"""
    # ROLE
    You are a post processing agent responsible for rewriting a response generated by an agent, according to instructions provided below. Ensure that the response you produce adheres to the instructions provided to you (if any).

    ------------------------------------------------------------------------
    # ADDITIONAL INSTRUCTIONS
    Here are additional instructions that the admin might have configured for you:
    {post_process_instructions}

    ------------------------------------------------------------------------
    # CHAT HISTORY
    Here is the chat history:
    {agent_history_str}
    """

    context_section = f"""
    ------------------------------------------------------------------------
    # CONTEXT
    Here is the context:
    {context}
    """ if context else ""

    style_section = f"""
    ------------------------------------------------------------------------
    # STYLE PROMPT
    Here is the style prompt:
    {style_prompt}
    """ if style_prompt else ""

    tail = f"""
    ------------------------------------------------------------------------
    # AGENT INSTRUCTIONS
    Here are the instructions to the agent generating the response:
    {agent_instructions}

    ------------------------------------------------------------------------
    # AGENT RESPONSE
    Here is the response that the agent has generated:
    {agent_response}
    """

    return head + context_section + style_section + tail


def post_process_response(
    messages: list,
    post_processing_agent_name: str,
    post_process_instructions: str,
    style_prompt: Optional[str] = None,
    context: Optional[str] = None,
    model: str = "gpt-4o",
    tokens_used: Optional[dict] = None,
    last_agent: Optional[Agent] = None,
) -> Response:
    """
    Rewrite the last message of a turn according to admin-configured instructions.

    The last entry of ``messages`` is deep-copied, so the caller's list is never
    mutated. The copy is rewritten by an LLM call and returned with
    ``response_type`` set to ``"external"`` and the post-processing agent's name
    appended to ``sender``.

    Post processing is skipped when ``messages`` is empty, or when the last
    message has tool calls, is not of ``response_type == "internal"``, has no
    content, or when ``post_process_instructions`` is empty.

    Args:
        messages (list): Conversation messages (dicts); only the last one is processed.
        post_processing_agent_name (str): Name appended to the message's ``sender``
            as ``"<sender> >> <name>"``.
        post_process_instructions (str): Rewrite instructions placed in the prompt.
        style_prompt (str, optional): Extra style guidance; added to the prompt when set.
        context (str, optional): Extra context; added to the prompt when set.
        model (str): Model name passed to ``generate_openai_output`` and
            ``update_tokens_used``.
        tokens_used (dict, optional): Running token accounting, passed through
            ``update_tokens_used``. A fresh dict is created when omitted.
        last_agent (Agent, optional): Agent that produced the message; its
            ``instructions`` and ``history`` feed the prompt. When omitted, empty
            instructions and history are used.

    Returns:
        Response: With ``messages=[]`` when post processing was skipped, otherwise
        with ``messages`` holding the single rewritten message. ``tokens_used``,
        ``agent=last_agent`` and an empty ``error_msg`` are set in both cases.
    """
    # A `{}` default would be created once and shared by every call that omits
    # the argument, leaking token counts between unrelated requests.
    if tokens_used is None:
        tokens_used = {}

    # `last_agent` defaults to None; fall back to empty values instead of
    # raising AttributeError on the default.
    agent_instructions = last_agent.instructions if last_agent is not None else ''
    agent_history = last_agent.history if last_agent is not None else []

    if not messages:
        logger.info("No messages to post process, skipping post processing")
        return Response(messages=[], tokens_used=tokens_used, agent=last_agent, error_msg='')

    pending_msg = copy.deepcopy(messages[-1])
    logger.debug(f"Pending message keys: {pending_msg.keys()}")

    skip_reason = _post_process_skip_reason(pending_msg, post_process_instructions)
    if skip_reason:
        logger.info(skip_reason)
        # NOTE(review): the log text says the last message is set to external,
        # but the skipped message is not returned (messages=[]), so the caller
        # has to handle that itself - confirm against the caller.
        return Response(messages=[], tokens_used=tokens_used, agent=last_agent, error_msg='')

    # The final history entry is left out of the transcript (the pending
    # response is passed separately under "# AGENT RESPONSE").
    agent_history_str = f"\n{'*'*100}\n".join(
        f"Role: {message['role']} | Content: {message.get('content', 'None')} | Tool Calls: {message.get('tool_calls', 'None')}"
        for message in agent_history[:-1]
    )
    logger.debug(f"Agent history: {agent_history_str}")

    prompt = _build_post_process_prompt(
        post_process_instructions=post_process_instructions,
        agent_history_str=agent_history_str,
        context=context,
        style_prompt=style_prompt,
        agent_instructions=agent_instructions,
        agent_response=pending_msg['content'],
    )

    logger.debug(f"Sanitizing response for style. Original response: {pending_msg['content']}")
    completion = generate_openai_output(
        messages=[
            {"role": "system", "content": prompt}
        ],
        model=model,
        return_completion=True,
    )

    content = completion.choices[0].message.content
    if content:
        content = content.strip()

    tokens_used = update_tokens_used(provider="openai", model=model, tokens_used=tokens_used, completion=completion)
    logger.debug(f"Response after style check: {content}, tokens used: {tokens_used}")

    # Keep the agent's original text when the model returned nothing usable
    # (None, empty, or whitespace only).
    if content:
        pending_msg['content'] = content
    pending_msg['response_type'] = "external"
    # A missing or None sender is treated as an empty name rather than raising.
    pending_msg['sender'] = f"{pending_msg.get('sender') or ''} >> {post_processing_agent_name}"

    return Response(
        messages=[pending_msg],
        tokens_used=tokens_used,
        agent=last_agent,
        error_msg=''
    )