diff --git a/chatbot_ui/.vscode/launch.json b/chatbot_ui/.vscode/launch.json index 2064a252..05c7ea22 100644 --- a/chatbot_ui/.vscode/launch.json +++ b/chatbot_ui/.vscode/launch.json @@ -7,16 +7,15 @@ { "python": "${workspaceFolder}/venv/bin/python", "name": "chatbot-ui", - "cwd": "${workspaceFolder}/app", "type": "debugpy", "request": "launch", - "program": "run.py", + "program": "run_stream.py", "console": "integratedTerminal", "env": { "LLM": "1", "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1", "STREAMING": "True", - "ARCH_CONFIG": "../../demos/function_calling/arch_config.yaml" + "ARCH_CONFIG": "../demos/function_calling/arch_config.yaml" } }, { diff --git a/chatbot_ui/Dockerfile b/chatbot_ui/Dockerfile index 68b41f94..32101bf5 100644 --- a/chatbot_ui/Dockerfile +++ b/chatbot_ui/Dockerfile @@ -8,13 +8,11 @@ COPY requirements.txt /src/ RUN pip install --prefix=/runtime --force-reinstall -r requirements.txt -COPY . /src - FROM python:3.10-slim AS output COPY --from=builder /runtime /usr/local -COPY /app /app WORKDIR /app +COPY *.py . -CMD ["python", "run.py"] +CMD ["python", "run_stream.py"] diff --git a/chatbot_ui/app/arch_util.py b/chatbot_ui/app/arch_util.py deleted file mode 100644 index 567640e5..00000000 --- a/chatbot_ui/app/arch_util.py +++ /dev/null @@ -1,20 +0,0 @@ -import json - - -ARCH_STATE_HEADER = "x-arch-state" - - -def get_arch_messages(response_json): - arch_messages = [] - if response_json and "metadata" in response_json: - # load arch_state from metadata - arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}") - # parse arch_state into json object - arch_state = json.loads(arch_state_str) - # load messages from arch_state - arch_messages_str = arch_state.get("messages", "[]") - # parse messages into json object - arch_messages = json.loads(arch_messages_str) - # append messages from arch gateway to history - return arch_messages - return [] diff --git a/chatbot_ui/app/run.py b/chatbot_ui/app/run.py deleted file mode 100644 index b0d5acc6..00000000 --- a/chatbot_ui/app/run.py +++ /dev/null @@ -1,231 +0,0 @@ -import json -import os -import logging -import yaml -from arch_util import get_arch_messages -import gradio as gr - -from typing import List, Optional, Tuple -from openai import OpenAI -from dotenv import load_dotenv - -load_dotenv() - -STREAM_RESPONSE = bool(os.getenv("STREAM_RESPOSE", True)) - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", -) - -log = logging.getLogger(__name__) - -CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT") -log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}") - - -CSS_STYLE = """ -.json-container { - height: 95vh !important; - overflow-y: auto !important; -} -.chatbot { - height: calc(95vh - 100px) !important; - overflow-y: auto !important; -} -footer {visibility: hidden} -""" - -client = OpenAI( - api_key="--", - base_url=CHAT_COMPLETION_ENDPOINT, - # http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}), -) - - -def convert_prompt_target_to_openai_format(target): - tool = { - "description": target["description"], - "parameters": {"type": "object", "properties": {}, "required": []}, - } - - if "parameters" in target: - for param_info in target["parameters"]: - parameter = { - "type": param_info["type"], - "description": param_info["description"], - } - - for key in ["default", "format", "enum", "items", "minimum", "maximum"]: - if key in param_info: - parameter[key] = param_info[key] - - tool["parameters"]["properties"][param_info["name"]] = parameter - - required = param_info.get("required", False) - if required: - tool["parameters"]["required"].append(param_info["name"]) - - return {"name": target["name"], "info": tool} - - -def get_prompt_targets(): - try: - with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file: - config = yaml.safe_load(file) - - available_tools = [] - for target in config["prompt_targets"]: - if not target.get("default", False): - available_tools.append( - convert_prompt_target_to_openai_format(target) - ) - - return {tool["name"]: tool["info"] for tool in available_tools} - except Exception as e: - log.info(e) - return None - - -def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], state): - if "history" not in state: - state["history"] = [] - - history = state.get("history") - history.append({"role": "user", "content": query}) - log.info(f"history: {history}") - - # Custom headers - custom_headers = { - "x-arch-deterministic-provider": "openai", - } - - try: - raw_response = client.chat.completions.with_raw_response.create( - model="--", - messages=history, - temperature=1.0, - # metadata=metadata, - extra_headers=custom_headers, - stream=STREAM_RESPONSE, - ) - except Exception as e: - log.info(e) - # remove last user message in case of exception - history.pop() - log.info("Error calling gateway API: {}".format(e)) - raise gr.Error("Error calling gateway API: {}".format(e)) - - if STREAM_RESPONSE: - response = raw_response.parse() - history.append({"role": "assistant", "content": "", "model": ""}) - # for gradio UI we don't want to show raw tool calls and messages from developer application - # so we're filtering those out - history_view = [h for h in history if h["role"] != "tool" and "content" in h] - - messages = [ - (history_view[i]["content"], history_view[i + 1]["content"]) - for i in range(0, len(history_view) - 1, 2) - ] - - for chunk in response: - if len(chunk.choices) > 0: - if chunk.choices[0].delta.role: - if history[-1]["role"] != chunk.choices[0].delta.role: - history.append( - { - "role": chunk.choices[0].delta.role, - "content": chunk.choices[0].delta.content, - "model": chunk.model, - "tool_calls": chunk.choices[0].delta.tool_calls, - } - ) - - history[-1]["model"] = chunk.model - if chunk.choices[0].delta.content: - if not history[-1]["content"]: - history[-1]["content"] = "" - history[-1]["content"] = ( - history[-1]["content"] + chunk.choices[0].delta.content - ) - if chunk.choices[0].delta.tool_calls: - history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls - - if chunk.model and chunk.choices[0].delta.content: - messages[-1] = ( - messages[-1][0], - messages[-1][1] + chunk.choices[0].delta.content, - ) - yield "", messages, state - else: - log.error(f"raw_response: {raw_response.text}") - response = raw_response.parse() - - # extract arch_state from metadata and store it in gradio session state - # this state must be passed back to the gateway in the next request - response_json = json.loads(raw_response.text) - log.info(response_json) - - arch_messages = get_arch_messages(response_json) - for arch_message in arch_messages: - history.append(arch_message) - - content = response.choices[0].message.content - - history.append( - {"role": "assistant", "content": content, "model": response.model} - ) - - # for gradio UI we don't want to show raw tool calls and messages from developer application - # so we're filtering those out - history_view = [h for h in history if h["role"] != "tool" and "content" in h] - - messages = [ - (history_view[i]["content"], history_view[i + 1]["content"]) - for i in range(0, len(history_view) - 1, 2) - ] - - yield "", messages, state - - -def main(): - with gr.Blocks( - theme=gr.themes.Default( - font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"] - ), - fill_height=True, - css=CSS_STYLE, - ) as demo: - with gr.Row(equal_height=True): - state = gr.State({}) - - with gr.Column(scale=4): - gr.JSON( - value=get_prompt_targets(), - open=True, - show_indices=False, - label="Available Tools", - scale=1, - min_height="95vh", - elem_classes="json-container", - ) - with gr.Column(scale=6): - chatbot = gr.Chatbot( - label="Arch Chatbot", - scale=1, - elem_classes="chatbot", - ) - textbox = gr.Textbox( - show_label=False, - placeholder="Enter text and press enter", - scale=1, - autofocus=True, - ) - - textbox.submit(chat, [textbox, chatbot, state], [textbox, chatbot, state]) - - demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) - - -if __name__ == "__main__": - main() diff --git a/chatbot_ui/common.py b/chatbot_ui/common.py new file mode 100644 index 00000000..3fd5c265 --- /dev/null +++ b/chatbot_ui/common.py @@ -0,0 +1,77 @@ +import json +import logging +import os +import yaml + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +log = logging.getLogger(__name__) + + +def process_stream_chunk(chunk, history): + delta = chunk.choices[0].delta + if delta.role and delta.role != history[-1]["role"]: + # create new history item if role changes + # this is likely due to arch tool call and api response + history.append({"role": delta.role}) + + history[-1]["model"] = chunk.model + # append tool calls to history if there are any in the chunk + if delta.tool_calls: + history[-1]["tool_calls"] = delta.tool_calls + + if delta.content: + # append content to the last history item + history[-1]["content"] = history[-1].get("content", "") + delta.content + # yield content if it is from assistant + if history[-1]["role"] == "assistant": + return delta.content + + return None + + +def convert_prompt_target_to_openai_format(target): + tool = { + "description": target["description"], + "parameters": {"type": "object", "properties": {}, "required": []}, + } + + if "parameters" in target: + for param_info in target["parameters"]: + parameter = { + "type": param_info["type"], + "description": param_info["description"], + } + + for key in ["default", "format", "enum", "items", "minimum", "maximum"]: + if key in param_info: + parameter[key] = param_info[key] + + tool["parameters"]["properties"][param_info["name"]] = parameter + + required = param_info.get("required", False) + if required: + tool["parameters"]["required"].append(param_info["name"]) + + return {"name": target["name"], "info": tool} + + +def get_prompt_targets(): + try: + with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file: + config = yaml.safe_load(file) + + available_tools = [] + for target in config["prompt_targets"]: + if not target.get("default", False): + available_tools.append( + convert_prompt_target_to_openai_format(target) + ) + + return {tool["name"]: tool["info"] for tool in available_tools} + except Exception as e: + log.info(e) + return None diff --git a/chatbot_ui/run_stream.py b/chatbot_ui/run_stream.py new file mode 100644 index 00000000..bd4eab56 --- /dev/null +++ b/chatbot_ui/run_stream.py @@ -0,0 +1,120 @@ +import json +import os +import logging +import yaml +import gradio as gr + +from typing import List, Optional, Tuple +from openai import OpenAI +from dotenv import load_dotenv + +from common import get_prompt_targets, process_stream_chunk + +load_dotenv() + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +log = logging.getLogger(__name__) + +CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT") +log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}") + + +CSS_STYLE = """ +.json-container { + height: 95vh !important; + overflow-y: auto !important; +} +.chatbot { + height: calc(95vh - 100px) !important; + overflow-y: auto !important; +} +footer {visibility: hidden} +""" + +client = OpenAI( + api_key="--", + base_url=CHAT_COMPLETION_ENDPOINT, +) + + +def chat( + query: Optional[str], + conversation: Optional[List[Tuple[str, str]]], + history: List[dict], +): + history.append({"role": "user", "content": query}) + + try: + response = client.chat.completions.create( + # we select model from arch_config file + model="--", + messages=history, + temperature=1.0, + stream=True, + ) + except Exception as e: + # remove last user message in case of exception + history.pop() + log.info("Error calling gateway API: {}".format(e)) + raise gr.Error("Error calling gateway API: {}".format(e)) + + conversation.append((query, "")) + + for chunk in response: + tokens = process_stream_chunk(chunk, history) + if tokens: + conversation[-1] = ( + conversation[-1][0], + conversation[-1][1] + tokens, + ) + + yield "", conversation, history + + +def main(): + with gr.Blocks( + theme=gr.themes.Default( + font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"] + ), + fill_height=True, + css=CSS_STYLE, + ) as demo: + with gr.Row(equal_height=True): + history = gr.State([]) + + with gr.Column(scale=1): + with gr.Accordion("See available tools", open=False): + with gr.Column(scale=1): + gr.JSON( + value=get_prompt_targets(), + show_indices=False, + elem_classes="json-container", + min_height="95vh", + ) + + with gr.Column(scale=2): + chatbot = gr.Chatbot( + label="Arch Chatbot", + elem_classes="chatbot", + ) + textbox = gr.Textbox( + show_label=False, + placeholder="Enter text and press enter", + autofocus=True, + elem_classes="textbox", + ) + + textbox.submit( + chat, [textbox, chatbot, history], [textbox, chatbot, history] + ) + + demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True) + + +if __name__ == "__main__": + main() diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index 5d79d181..4bbd3fa6 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -900,7 +900,11 @@ impl StreamContext { // don't send tools message and api response to chat gpt for m in callout_context.request_body.messages.iter() { - if m.role == TOOL_ROLE || m.content.is_none() { + // don't send api response and tool calls to upstream LLMs + if m.role == TOOL_ROLE + || m.content.is_none() + || (m.tool_calls.is_some() && !m.tool_calls.as_ref().unwrap().is_empty()) + { continue; } messages.push(m.clone()); diff --git a/demos/function_calling/api_server/app/main.py b/demos/function_calling/api_server/app/main.py index e87a3a21..a69c75d1 100644 --- a/demos/function_calling/api_server/app/main.py +++ b/demos/function_calling/api_server/app/main.py @@ -71,7 +71,7 @@ class DefaultTargetRequest(BaseModel): @app.post("/default_target") async def default_target(req: DefaultTargetRequest, res: Response): - logger.info(f"Received arch_messages: {req.messages}") + logger.info(f"Received messages: {req.messages}") resp = { "choices": [ { diff --git a/model_server/app/main.py b/model_server/app/main.py index 93d6217b..a8d312d7 100644 --- a/model_server/app/main.py +++ b/model_server/app/main.py @@ -186,8 +186,8 @@ async def hallucination(req: HallucinationRequest, res: Response): start_time = time.perf_counter() classifier = zero_shot_model["pipeline"] - if "arch_messages" in req.parameters: - req.parameters.pop("arch_messages") + if "messages" in req.parameters: + req.parameters.pop("messages") candidate_labels = {f"{k} is {v}": k for k, v in req.parameters.items()}