Add support for streaming and fixes few issues (see description) (#202)

2026-07-17 16:31:04 +02:00 · 2024-10-28 20:05:06 -04:00 · 2024-10-28 20:05:06 -04:00 · 662a840ac5
commit 662a840ac5
parent 29ff8da60f
45 changed files with 2266 additions and 477 deletions
--- a/chatbot_ui/app/arch_util.py
+++ b/chatbot_ui/app/arch_util.py
@ -0,0 +1,20 @@
+import json
+
+
+ARCH_STATE_HEADER = "x-arch-state"
+
+
+def get_arch_messages(response_json):
+    arch_messages = []
+    if response_json and "metadata" in response_json:
+        # load arch_state from metadata
+        arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}")
+        # parse arch_state into json object
+        arch_state = json.loads(arch_state_str)
+        # load messages from arch_state
+        arch_messages_str = arch_state.get("messages", "[]")
+        # parse messages into json object
+        arch_messages = json.loads(arch_messages_str)
+        # append messages from arch gateway to history
+        return arch_messages
+    return []
--- a/chatbot_ui/app/run.py
+++ b/chatbot_ui/app/run.py
@ -2,14 +2,17 @@ import json
 import os
 import logging
 import yaml
+from arch_util import get_arch_messages
 import gradio as gr

 from typing import List, Optional, Tuple
-from openai import OpenAI, DefaultHttpxClient
+from openai import OpenAI
 from dotenv import load_dotenv

 load_dotenv()

+STREAM_RESPONSE = bool(os.getenv("STREAM_RESPOSE", True))
+
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
@ -20,7 +23,6 @@ log = logging.getLogger(__name__)
 CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
 log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")

-ARCH_STATE_HEADER = "x-arch-state"

 CSS_STYLE = """
 .json-container {
@ -37,7 +39,7 @@ footer {visibility: hidden}
 client = OpenAI(
    api_key="--",
    base_url=CHAT_COMPLETION_ENDPOINT,
-    http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
+    # http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
 )


@ -69,7 +71,7 @@ def convert_prompt_target_to_openai_format(target):

 def get_prompt_targets():
    try:
-        with open("arch_config.yaml", "r") as file:
+        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
            config = yaml.safe_load(file)

            available_tools = []
@ -105,48 +107,85 @@ def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], st
            temperature=1.0,
            # metadata=metadata,
            extra_headers=custom_headers,
+            stream=STREAM_RESPONSE,
        )
    except Exception as e:
        log.info(e)
        # remove last user message in case of exception
        history.pop()
-        log.info("Error calling gateway API: {}".format(e.message))
-        raise gr.Error("Error calling gateway API: {}".format(e.message))
+        log.info("Error calling gateway API: {}".format(e))
+        raise gr.Error("Error calling gateway API: {}".format(e))

-    log.error(f"raw_response: {raw_response.text}")
-    response = raw_response.parse()
+    if STREAM_RESPONSE:
+        response = raw_response.parse()
+        history.append({"role": "assistant", "content": "", "model": ""})
+        # for gradio UI we don't want to show raw tool calls and messages from developer application
+        # so we're filtering those out
+        history_view = [h for h in history if h["role"] != "tool" and "content" in h]

-    # extract arch_state from metadata and store it in gradio session state
-    # this state must be passed back to the gateway in the next request
-    response_json = json.loads(raw_response.text)
-    log.info(response_json)
-    if response_json and "metadata" in response_json:
-        # load arch_state from metadata
-        arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}")
-        # parse arch_state into json object
-        arch_state = json.loads(arch_state_str)
-        # load messages from arch_state
-        arch_messages_str = arch_state.get("messages", "[]")
-        # parse messages into json object
-        arch_messages = json.loads(arch_messages_str)
-        # append messages from arch gateway to history
-        for message in arch_messages:
-            history.append(message)
+        messages = [
+            (history_view[i]["content"], history_view[i + 1]["content"])
+            for i in range(0, len(history_view) - 1, 2)
+        ]

-    content = response.choices[0].message.content
+        for chunk in response:
+            if len(chunk.choices) > 0:
+                if chunk.choices[0].delta.role:
+                    if history[-1]["role"] != chunk.choices[0].delta.role:
+                        history.append(
+                            {
+                                "role": chunk.choices[0].delta.role,
+                                "content": chunk.choices[0].delta.content,
+                                "model": chunk.model,
+                                "tool_calls": chunk.choices[0].delta.tool_calls,
+                            }
+                        )

-    history.append({"role": "assistant", "content": content, "model": response.model})
+                history[-1]["model"] = chunk.model
+                if chunk.choices[0].delta.content:
+                    if not history[-1]["content"]:
+                        history[-1]["content"] = ""
+                    history[-1]["content"] = (
+                        history[-1]["content"] + chunk.choices[0].delta.content
+                    )
+                if chunk.choices[0].delta.tool_calls:
+                    history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls

-    # for gradio UI we don't want to show raw tool calls and messages from developer application
-    # so we're filtering those out
-    history_view = [h for h in history if h["role"] != "tool" and "content" in h]
+                if chunk.model and chunk.choices[0].delta.content:
+                    messages[-1] = (
+                        messages[-1][0],
+                        messages[-1][1] + chunk.choices[0].delta.content,
+                    )
+                yield "", messages, state
+    else:
+        log.error(f"raw_response: {raw_response.text}")
+        response = raw_response.parse()

-    messages = [
-        (history_view[i]["content"], history_view[i + 1]["content"])
-        for i in range(0, len(history_view) - 1, 2)
-    ]
+        # extract arch_state from metadata and store it in gradio session state
+        # this state must be passed back to the gateway in the next request
+        response_json = json.loads(raw_response.text)
+        log.info(response_json)

-    return "", messages, state
+        arch_messages = get_arch_messages(response_json)
+        for arch_message in arch_messages:
+            history.append(arch_message)
+
+        content = response.choices[0].message.content
+
+        history.append(
+            {"role": "assistant", "content": content, "model": response.model}
+        )
+
+        # for gradio UI we don't want to show raw tool calls and messages from developer application
+        # so we're filtering those out
+        history_view = [h for h in history if h["role"] != "tool" and "content" in h]
+
+        messages = [
+            (history_view[i]["content"], history_view[i + 1]["content"])
+            for i in range(0, len(history_view) - 1, 2)
+        ]
+
+        yield "", messages, state


 def main():
--- a/chatbot_ui/app/run_stream.py
+++ b/chatbot_ui/app/run_stream.py
@ -1,36 +0,0 @@
-# copied from https://www.gradio.app/guides/creating-a-chatbot-fast#a-streaming-example-using-openai
-
-import os
-from openai import OpenAI
-import gradio as gr
-
-api_key = os.getenv("OPENAI_API_KEY")
-CHAT_COMPLETION_ENDPOINT = os.getenv(
-    "CHAT_COMPLETION_ENDPOINT", "https://api.openai.com/v1"
-)
-
-client = OpenAI(api_key=api_key, base_url=CHAT_COMPLETION_ENDPOINT)
-
-
-def predict(message, history):
-    history_openai_format = []
-    for human, assistant in history:
-        history_openai_format.append({"role": "user", "content": human})
-        history_openai_format.append({"role": "assistant", "content": assistant})
-    history_openai_format.append({"role": "user", "content": message})
-
-    response = client.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=history_openai_format,
-        temperature=1.0,
-        stream=True,
-    )
-
-    partial_message = ""
-    for chunk in response:
-        if chunk.choices[0].delta.content is not None:
-            partial_message = partial_message + chunk.choices[0].delta.content
-            yield partial_message
-
-
-gr.ChatInterface(predict).launch(server_name="0.0.0.0", server_port=8081)