Improve Gradio UI and fix arch_state bug (#227)

2026-06-05 14:45:15 +02:00 · 2024-10-29 11:27:13 -07:00 · 2024-10-29 11:27:13 -07:00 · 60299244b9
commit 60299244b9
parent 662a840ac5
9 changed files with 209 additions and 262 deletions
--- a/chatbot_ui/.vscode/launch.json
+++ b/chatbot_ui/.vscode/launch.json
@ -7,16 +7,15 @@
    {
      "python": "${workspaceFolder}/venv/bin/python",
      "name": "chatbot-ui",
      "cwd": "${workspaceFolder}/app",
      "type": "debugpy",
      "request": "launch",
-      "program": "run.py",
+      "program": "run_stream.py",
      "console": "integratedTerminal",
      "env": {
        "LLM": "1",
        "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
        "STREAMING": "True",
-        "ARCH_CONFIG": "../../demos/function_calling/arch_config.yaml"
+        "ARCH_CONFIG": "../demos/function_calling/arch_config.yaml"
      }
    },
    {
--- a/chatbot_ui/Dockerfile
+++ b/chatbot_ui/Dockerfile
@ -8,13 +8,11 @@ COPY requirements.txt /src/
 RUN pip install --prefix=/runtime --force-reinstall -r requirements.txt
 COPY . /src
 FROM python:3.10-slim AS output
 COPY --from=builder /runtime /usr/local
 COPY /app /app
 WORKDIR /app
 COPY *.py .
-CMD ["python", "run.py"]
+CMD ["python", "run_stream.py"]
--- a/chatbot_ui/app/arch_util.py
+++ b/chatbot_ui/app/arch_util.py
@ -1,20 +0,0 @@
 import json
 ARCH_STATE_HEADER = "x-arch-state"
 def get_arch_messages(response_json):
    arch_messages = []
    if response_json and "metadata" in response_json:
        # load arch_state from metadata
        arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}")
        # parse arch_state into json object
        arch_state = json.loads(arch_state_str)
        # load messages from arch_state
        arch_messages_str = arch_state.get("messages", "[]")
        # parse messages into json object
        arch_messages = json.loads(arch_messages_str)
        # append messages from arch gateway to history
        return arch_messages
    return []
--- a/chatbot_ui/app/run.py
+++ b/chatbot_ui/app/run.py
@ -1,231 +0,0 @@
 import json
 import os
 import logging
 import yaml
 from arch_util import get_arch_messages
 import gradio as gr
 from typing import List, Optional, Tuple
 from openai import OpenAI
 from dotenv import load_dotenv
 load_dotenv()
 STREAM_RESPONSE = bool(os.getenv("STREAM_RESPOSE", True))
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
 )
 log = logging.getLogger(__name__)
 CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
 log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")
 CSS_STYLE = """
 .json-container {
    height: 95vh !important;
    overflow-y: auto !important;
 }
 .chatbot {
    height: calc(95vh - 100px) !important;
    overflow-y: auto !important;
 }
 footer {visibility: hidden}
 """
 client = OpenAI(
    api_key="--",
    base_url=CHAT_COMPLETION_ENDPOINT,
    # http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
 )
 def convert_prompt_target_to_openai_format(target):
    tool = {
        "description": target["description"],
        "parameters": {"type": "object", "properties": {}, "required": []},
    }
    if "parameters" in target:
        for param_info in target["parameters"]:
            parameter = {
                "type": param_info["type"],
                "description": param_info["description"],
            }
            for key in ["default", "format", "enum", "items", "minimum", "maximum"]:
                if key in param_info:
                    parameter[key] = param_info[key]
            tool["parameters"]["properties"][param_info["name"]] = parameter
            required = param_info.get("required", False)
            if required:
                tool["parameters"]["required"].append(param_info["name"])
    return {"name": target["name"], "info": tool}
 def get_prompt_targets():
    try:
        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
            config = yaml.safe_load(file)
            available_tools = []
            for target in config["prompt_targets"]:
                if not target.get("default", False):
                    available_tools.append(
                        convert_prompt_target_to_openai_format(target)
                    )
            return {tool["name"]: tool["info"] for tool in available_tools}
    except Exception as e:
        log.info(e)
        return None
 def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], state):
    if "history" not in state:
        state["history"] = []
    history = state.get("history")
    history.append({"role": "user", "content": query})
    log.info(f"history: {history}")
    # Custom headers
    custom_headers = {
        "x-arch-deterministic-provider": "openai",
    }
    try:
        raw_response = client.chat.completions.with_raw_response.create(
            model="--",
            messages=history,
            temperature=1.0,
            # metadata=metadata,
            extra_headers=custom_headers,
            stream=STREAM_RESPONSE,
        )
    except Exception as e:
        log.info(e)
        # remove last user message in case of exception
        history.pop()
        log.info("Error calling gateway API: {}".format(e))
        raise gr.Error("Error calling gateway API: {}".format(e))
    if STREAM_RESPONSE:
        response = raw_response.parse()
        history.append({"role": "assistant", "content": "", "model": ""})
        # for gradio UI we don't want to show raw tool calls and messages from developer application
        # so we're filtering those out
        history_view = [h for h in history if h["role"] != "tool" and "content" in h]
        messages = [
            (history_view[i]["content"], history_view[i + 1]["content"])
            for i in range(0, len(history_view) - 1, 2)
        ]
        for chunk in response:
            if len(chunk.choices) > 0:
                if chunk.choices[0].delta.role:
                    if history[-1]["role"] != chunk.choices[0].delta.role:
                        history.append(
                            {
                                "role": chunk.choices[0].delta.role,
                                "content": chunk.choices[0].delta.content,
                                "model": chunk.model,
                                "tool_calls": chunk.choices[0].delta.tool_calls,
                            }
                        )
                history[-1]["model"] = chunk.model
                if chunk.choices[0].delta.content:
                    if not history[-1]["content"]:
                        history[-1]["content"] = ""
                    history[-1]["content"] = (
                        history[-1]["content"] + chunk.choices[0].delta.content
                    )
                if chunk.choices[0].delta.tool_calls:
                    history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls
                if chunk.model and chunk.choices[0].delta.content:
                    messages[-1] = (
                        messages[-1][0],
                        messages[-1][1] + chunk.choices[0].delta.content,
                    )
                yield "", messages, state
    else:
        log.error(f"raw_response: {raw_response.text}")
        response = raw_response.parse()
        # extract arch_state from metadata and store it in gradio session state
        # this state must be passed back to the gateway in the next request
        response_json = json.loads(raw_response.text)
        log.info(response_json)
        arch_messages = get_arch_messages(response_json)
        for arch_message in arch_messages:
            history.append(arch_message)
        content = response.choices[0].message.content
        history.append(
            {"role": "assistant", "content": content, "model": response.model}
        )
        # for gradio UI we don't want to show raw tool calls and messages from developer application
        # so we're filtering those out
        history_view = [h for h in history if h["role"] != "tool" and "content" in h]
        messages = [
            (history_view[i]["content"], history_view[i + 1]["content"])
            for i in range(0, len(history_view) - 1, 2)
        ]
        yield "", messages, state
 def main():
    with gr.Blocks(
        theme=gr.themes.Default(
            font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"]
        ),
        fill_height=True,
        css=CSS_STYLE,
    ) as demo:
        with gr.Row(equal_height=True):
            state = gr.State({})
            with gr.Column(scale=4):
                gr.JSON(
                    value=get_prompt_targets(),
                    open=True,
                    show_indices=False,
                    label="Available Tools",
                    scale=1,
                    min_height="95vh",
                    elem_classes="json-container",
                )
            with gr.Column(scale=6):
                chatbot = gr.Chatbot(
                    label="Arch Chatbot",
                    scale=1,
                    elem_classes="chatbot",
                )
                textbox = gr.Textbox(
                    show_label=False,
                    placeholder="Enter text and press enter",
                    scale=1,
                    autofocus=True,
                )
            textbox.submit(chat, [textbox, chatbot, state], [textbox, chatbot, state])
    demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
 if __name__ == "__main__":
    main()
--- a/chatbot_ui/common.py
+++ b/chatbot_ui/common.py
@ -0,0 +1,77 @@
 import json
 import logging
 import os
 import yaml
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
 )
 log = logging.getLogger(__name__)
 def process_stream_chunk(chunk, history):
    delta = chunk.choices[0].delta
    if delta.role and delta.role != history[-1]["role"]:
        # create new history item if role changes
        # this is likely due to arch tool call and api response
        history.append({"role": delta.role})
    history[-1]["model"] = chunk.model
    # append tool calls to history if there are any in the chunk
    if delta.tool_calls:
        history[-1]["tool_calls"] = delta.tool_calls
    if delta.content:
        # append content to the last history item
        history[-1]["content"] = history[-1].get("content", "") + delta.content
        # yield content if it is from assistant
        if history[-1]["role"] == "assistant":
            return delta.content
    return None
 def convert_prompt_target_to_openai_format(target):
    tool = {
        "description": target["description"],
        "parameters": {"type": "object", "properties": {}, "required": []},
    }
    if "parameters" in target:
        for param_info in target["parameters"]:
            parameter = {
                "type": param_info["type"],
                "description": param_info["description"],
            }
            for key in ["default", "format", "enum", "items", "minimum", "maximum"]:
                if key in param_info:
                    parameter[key] = param_info[key]
            tool["parameters"]["properties"][param_info["name"]] = parameter
            required = param_info.get("required", False)
            if required:
                tool["parameters"]["required"].append(param_info["name"])
    return {"name": target["name"], "info": tool}
 def get_prompt_targets():
    try:
        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
            config = yaml.safe_load(file)
            available_tools = []
            for target in config["prompt_targets"]:
                if not target.get("default", False):
                    available_tools.append(
                        convert_prompt_target_to_openai_format(target)
                    )
            return {tool["name"]: tool["info"] for tool in available_tools}
    except Exception as e:
        log.info(e)
        return None
--- a/chatbot_ui/run_stream.py
+++ b/chatbot_ui/run_stream.py
@ -0,0 +1,120 @@
 import json
 import os
 import logging
 import yaml
 import gradio as gr
 from typing import List, Optional, Tuple
 from openai import OpenAI
 from dotenv import load_dotenv
 from common import get_prompt_targets, process_stream_chunk
 load_dotenv()
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
 )
 log = logging.getLogger(__name__)
 CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
 log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")
 CSS_STYLE = """
 .json-container {
    height: 95vh !important;
    overflow-y: auto !important;
 }
 .chatbot {
    height: calc(95vh - 100px) !important;
    overflow-y: auto !important;
 }
 footer {visibility: hidden}
 """
 client = OpenAI(
    api_key="--",
    base_url=CHAT_COMPLETION_ENDPOINT,
 )
 def chat(
    query: Optional[str],
    conversation: Optional[List[Tuple[str, str]]],
    history: List[dict],
 ):
    history.append({"role": "user", "content": query})
    try:
        response = client.chat.completions.create(
            # we select model from arch_config file
            model="--",
            messages=history,
            temperature=1.0,
            stream=True,
        )
    except Exception as e:
        # remove last user message in case of exception
        history.pop()
        log.info("Error calling gateway API: {}".format(e))
        raise gr.Error("Error calling gateway API: {}".format(e))
    conversation.append((query, ""))
    for chunk in response:
        tokens = process_stream_chunk(chunk, history)
        if tokens:
            conversation[-1] = (
                conversation[-1][0],
                conversation[-1][1] + tokens,
            )
            yield "", conversation, history
 def main():
    with gr.Blocks(
        theme=gr.themes.Default(
            font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"]
        ),
        fill_height=True,
        css=CSS_STYLE,
    ) as demo:
        with gr.Row(equal_height=True):
            history = gr.State([])
            with gr.Column(scale=1):
                with gr.Accordion("See available tools", open=False):
                    with gr.Column(scale=1):
                        gr.JSON(
                            value=get_prompt_targets(),
                            show_indices=False,
                            elem_classes="json-container",
                            min_height="95vh",
                        )
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    label="Arch Chatbot",
                    elem_classes="chatbot",
                )
                textbox = gr.Textbox(
                    show_label=False,
                    placeholder="Enter text and press enter",
                    autofocus=True,
                    elem_classes="textbox",
                )
            textbox.submit(
                chat, [textbox, chatbot, history], [textbox, chatbot, history]
            )
    demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
 if __name__ == "__main__":
    main()
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@ -900,7 +900,11 @@ impl StreamContext {
        // don't send tools message and api response to chat gpt
        for m in callout_context.request_body.messages.iter() {
-            if m.role == TOOL_ROLE || m.content.is_none() {
+            // don't send api response and tool calls to upstream LLMs
            if m.role == TOOL_ROLE
                || m.content.is_none()
                || (m.tool_calls.is_some() && !m.tool_calls.as_ref().unwrap().is_empty())
            {
                continue;
            }
            messages.push(m.clone());
--- a/demos/function_calling/api_server/app/main.py
+++ b/demos/function_calling/api_server/app/main.py
@ -71,7 +71,7 @@ class DefaultTargetRequest(BaseModel):
@app.post("/default_target")
 async def default_target(req: DefaultTargetRequest, res: Response):
-    logger.info(f"Received arch_messages: {req.messages}")
+    logger.info(f"Received messages: {req.messages}")
    resp = {
        "choices": [
            {
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@ -186,8 +186,8 @@ async def hallucination(req: HallucinationRequest, res: Response):
    start_time = time.perf_counter()
    classifier = zero_shot_model["pipeline"]
-    if "arch_messages" in req.parameters:
+    if "messages" in req.parameters:
-        req.parameters.pop("arch_messages")
+        req.parameters.pop("messages")
    candidate_labels = {f"{k} is {v}": k for k, v in req.parameters.items()}