Add support for streaming and fix a few issues (see description) (#202)

This commit is contained in:
José Ulises Niño Rivera 2024-10-28 20:05:06 -04:00 committed by GitHub
parent 29ff8da60f
commit 662a840ac5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
45 changed files with 2266 additions and 477 deletions

View file

@ -0,0 +1,20 @@
import json
# Key under the response "metadata" object that carries the serialized arch state.
ARCH_STATE_HEADER = "x-arch-state"


def get_arch_messages(response_json):
    """Extract the arch gateway messages embedded in a response's metadata.

    The state is double-encoded: ``metadata[ARCH_STATE_HEADER]`` is a JSON
    string whose decoded object has a ``"messages"`` entry that is itself a
    JSON string encoding the list of messages.

    Args:
        response_json: Decoded response body (a mapping), or a falsy value.

    Returns:
        The decoded messages, or an empty list when the response, its
        metadata, the arch state, or the messages entry is missing, null,
        or empty.

    Raises:
        json.JSONDecodeError: If the arch state or its messages entry is a
            non-empty string that is not valid JSON.
    """
    if not response_json or "metadata" not in response_json:
        return []

    # "metadata" may be present but null; treat that the same as absent.
    metadata = response_json.get("metadata") or {}

    # A missing, null, or empty state string decodes to an empty state.
    arch_state = json.loads(metadata.get(ARCH_STATE_HEADER) or "{}") or {}

    # Messages are stored as a JSON string inside the state object.
    arch_messages = json.loads(arch_state.get("messages") or "[]")

    # A JSON null decodes to None; callers iterate the result, so return a list.
    return arch_messages or []

View file

@ -2,14 +2,17 @@ import json
import os
import logging
import yaml
from arch_util import get_arch_messages
import gradio as gr
from typing import List, Optional, Tuple
from openai import OpenAI, DefaultHttpxClient
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
# Whether to request streamed responses from the gateway (default: on).
# Environment values are strings, so bool() would treat "false" or "0" as
# True; compare against explicit "off" spellings instead. The misspelled
# STREAM_RESPOSE name is still honored so existing deployments keep working.
_stream_response_raw = os.getenv(
    "STREAM_RESPONSE", os.getenv("STREAM_RESPOSE", "true")
)
STREAM_RESPONSE = str(_stream_response_raw).strip().lower() not in (
    "",
    "0",
    "false",
    "no",
    "off",
)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
@ -20,7 +23,6 @@ log = logging.getLogger(__name__)
CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")
ARCH_STATE_HEADER = "x-arch-state"
CSS_STYLE = """
.json-container {
@ -37,7 +39,7 @@ footer {visibility: hidden}
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
# http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
)
@ -69,7 +71,7 @@ def convert_prompt_target_to_openai_format(target):
def get_prompt_targets():
try:
with open("arch_config.yaml", "r") as file:
with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
config = yaml.safe_load(file)
available_tools = []
@ -105,48 +107,85 @@ def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], st
temperature=1.0,
# metadata=metadata,
extra_headers=custom_headers,
stream=STREAM_RESPONSE,
)
except Exception as e:
log.info(e)
# remove last user message in case of exception
history.pop()
log.info("Error calling gateway API: {}".format(e.message))
raise gr.Error("Error calling gateway API: {}".format(e.message))
log.info("Error calling gateway API: {}".format(e))
raise gr.Error("Error calling gateway API: {}".format(e))
log.error(f"raw_response: {raw_response.text}")
response = raw_response.parse()
if STREAM_RESPONSE:
response = raw_response.parse()
history.append({"role": "assistant", "content": "", "model": ""})
# for gradio UI we don't want to show raw tool calls and messages from developer application
# so we're filtering those out
history_view = [h for h in history if h["role"] != "tool" and "content" in h]
# extract arch_state from metadata and store it in gradio session state
# this state must be passed back to the gateway in the next request
response_json = json.loads(raw_response.text)
log.info(response_json)
if response_json and "metadata" in response_json:
# load arch_state from metadata
arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}")
# parse arch_state into json object
arch_state = json.loads(arch_state_str)
# load messages from arch_state
arch_messages_str = arch_state.get("messages", "[]")
# parse messages into json object
arch_messages = json.loads(arch_messages_str)
# append messages from arch gateway to history
for message in arch_messages:
history.append(message)
messages = [
(history_view[i]["content"], history_view[i + 1]["content"])
for i in range(0, len(history_view) - 1, 2)
]
content = response.choices[0].message.content
for chunk in response:
if len(chunk.choices) > 0:
if chunk.choices[0].delta.role:
if history[-1]["role"] != chunk.choices[0].delta.role:
history.append(
{
"role": chunk.choices[0].delta.role,
"content": chunk.choices[0].delta.content,
"model": chunk.model,
"tool_calls": chunk.choices[0].delta.tool_calls,
}
)
history.append({"role": "assistant", "content": content, "model": response.model})
history[-1]["model"] = chunk.model
if chunk.choices[0].delta.content:
if not history[-1]["content"]:
history[-1]["content"] = ""
history[-1]["content"] = (
history[-1]["content"] + chunk.choices[0].delta.content
)
if chunk.choices[0].delta.tool_calls:
history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls
# for gradio UI we don't want to show raw tool calls and messages from developer application
# so we're filtering those out
history_view = [h for h in history if h["role"] != "tool" and "content" in h]
if chunk.model and chunk.choices[0].delta.content:
messages[-1] = (
messages[-1][0],
messages[-1][1] + chunk.choices[0].delta.content,
)
yield "", messages, state
else:
log.error(f"raw_response: {raw_response.text}")
response = raw_response.parse()
messages = [
(history_view[i]["content"], history_view[i + 1]["content"])
for i in range(0, len(history_view) - 1, 2)
]
# extract arch_state from metadata and store it in gradio session state
# this state must be passed back to the gateway in the next request
response_json = json.loads(raw_response.text)
log.info(response_json)
return "", messages, state
arch_messages = get_arch_messages(response_json)
for arch_message in arch_messages:
history.append(arch_message)
content = response.choices[0].message.content
history.append(
{"role": "assistant", "content": content, "model": response.model}
)
# for gradio UI we don't want to show raw tool calls and messages from developer application
# so we're filtering those out
history_view = [h for h in history if h["role"] != "tool" and "content" in h]
messages = [
(history_view[i]["content"], history_view[i + 1]["content"])
for i in range(0, len(history_view) - 1, 2)
]
yield "", messages, state
def main():

View file

@ -1,36 +0,0 @@
# copied from https://www.gradio.app/guides/creating-a-chatbot-fast#a-streaming-example-using-openai
import os
from openai import OpenAI
import gradio as gr
# API key for the OpenAI client, read from OPENAI_API_KEY (None when unset).
api_key = os.getenv("OPENAI_API_KEY")
# Base URL for chat completions; defaults to the public OpenAI endpoint
# when CHAT_COMPLETION_ENDPOINT is not set.
CHAT_COMPLETION_ENDPOINT = os.getenv(
"CHAT_COMPLETION_ENDPOINT", "https://api.openai.com/v1"
)
# Module-level client that predict() uses for every request.
client = OpenAI(api_key=api_key, base_url=CHAT_COMPLETION_ENDPOINT)
def predict(message, history):
    """Stream an assistant reply for ``message`` given the prior chat turns.

    Args:
        message: The latest user message.
        history: Iterable of ``(human, assistant)`` pairs from earlier turns.

    Yields:
        The reply text accumulated so far, once for every streamed chunk
        that carries content.
    """
    # Rebuild the conversation in the role/content format the API expects.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=history_openai_format,
        temperature=1.0,
        stream=True,
    )

    partial_message = ""
    for chunk in response:
        # A streamed chunk may carry no choices; indexing choices[0]
        # unconditionally would raise IndexError and abort the stream.
        if not chunk.choices:
            continue
        delta_content = chunk.choices[0].delta.content
        if delta_content is not None:
            partial_message = partial_message + delta_content
            yield partial_message
# Serve predict() through Gradio's ChatInterface on 0.0.0.0:8081.
gr.ChatInterface(predict).launch(server_name="0.0.0.0", server_port=8081)