diff --git a/chatbot_ui/.vscode/launch.json b/chatbot_ui/.vscode/launch.json
index 2064a252..05c7ea22 100644
--- a/chatbot_ui/.vscode/launch.json
+++ b/chatbot_ui/.vscode/launch.json
@@ -7,16 +7,15 @@
     {
       "python": "${workspaceFolder}/venv/bin/python",
       "name": "chatbot-ui",
-      "cwd": "${workspaceFolder}/app",
       "type": "debugpy",
       "request": "launch",
-      "program": "run.py",
+      "program": "run_stream.py",
       "console": "integratedTerminal",
       "env": {
         "LLM": "1",
         "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
         "STREAMING": "True",
-        "ARCH_CONFIG": "../../demos/function_calling/arch_config.yaml"
+        "ARCH_CONFIG": "../demos/function_calling/arch_config.yaml"
       }
     },
     {
diff --git a/chatbot_ui/Dockerfile b/chatbot_ui/Dockerfile
index 68b41f94..32101bf5 100644
--- a/chatbot_ui/Dockerfile
+++ b/chatbot_ui/Dockerfile
@@ -8,13 +8,11 @@ COPY requirements.txt /src/
 
 RUN pip install --prefix=/runtime --force-reinstall -r requirements.txt
 
-COPY . /src
-
 FROM python:3.10-slim AS output
 
 COPY --from=builder /runtime /usr/local
 
-COPY /app /app
 WORKDIR /app
+COPY *.py .
 
-CMD ["python", "run.py"]
+CMD ["python", "run_stream.py"]
diff --git a/chatbot_ui/app/arch_util.py b/chatbot_ui/app/arch_util.py
deleted file mode 100644
index 567640e5..00000000
--- a/chatbot_ui/app/arch_util.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import json
-
-
-ARCH_STATE_HEADER = "x-arch-state"
-
-
-def get_arch_messages(response_json):
-    arch_messages = []
-    if response_json and "metadata" in response_json:
-        # load arch_state from metadata
-        arch_state_str = response_json.get("metadata", {}).get(ARCH_STATE_HEADER, "{}")
-        # parse arch_state into json object
-        arch_state = json.loads(arch_state_str)
-        # load messages from arch_state
-        arch_messages_str = arch_state.get("messages", "[]")
-        # parse messages into json object
-        arch_messages = json.loads(arch_messages_str)
-        # append messages from arch gateway to history
-        return arch_messages
-    return []
diff --git a/chatbot_ui/app/run.py b/chatbot_ui/app/run.py
deleted file mode 100644
index b0d5acc6..00000000
--- a/chatbot_ui/app/run.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import json
-import os
-import logging
-import yaml
-from arch_util import get_arch_messages
-import gradio as gr
-
-from typing import List, Optional, Tuple
-from openai import OpenAI
-from dotenv import load_dotenv
-
-load_dotenv()
-
-STREAM_RESPONSE = bool(os.getenv("STREAM_RESPOSE", True))
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-)
-
-log = logging.getLogger(__name__)
-
-CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
-log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")
-
-
-CSS_STYLE = """
-.json-container {
-    height: 95vh !important;
-    overflow-y: auto !important;
-}
-.chatbot {
-    height: calc(95vh - 100px) !important;
-    overflow-y: auto !important;
-}
-footer {visibility: hidden}
-"""
-
-client = OpenAI(
-    api_key="--",
-    base_url=CHAT_COMPLETION_ENDPOINT,
-    # http_client=DefaultHttpxClient(headers={"accept-encoding": "*"}),
-)
-
-
-def convert_prompt_target_to_openai_format(target):
-    tool = {
-        "description": target["description"],
-        "parameters": {"type": "object", "properties": {}, "required": []},
-    }
-
-    if "parameters" in target:
-        for param_info in target["parameters"]:
-            parameter = {
-                "type": param_info["type"],
-                "description": param_info["description"],
-            }
-
-            for key in ["default", "format", "enum", "items", "minimum", "maximum"]:
-                if key in param_info:
-                    parameter[key] = param_info[key]
-
-            tool["parameters"]["properties"][param_info["name"]] = parameter
-
-            required = param_info.get("required", False)
-            if required:
-                tool["parameters"]["required"].append(param_info["name"])
-
-    return {"name": target["name"], "info": tool}
-
-
-def get_prompt_targets():
-    try:
-        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
-            config = yaml.safe_load(file)
-
-            available_tools = []
-            for target in config["prompt_targets"]:
-                if not target.get("default", False):
-                    available_tools.append(
-                        convert_prompt_target_to_openai_format(target)
-                    )
-
-            return {tool["name"]: tool["info"] for tool in available_tools}
-    except Exception as e:
-        log.info(e)
-        return None
-
-
-def chat(query: Optional[str], conversation: Optional[List[Tuple[str, str]]], state):
-    if "history" not in state:
-        state["history"] = []
-
-    history = state.get("history")
-    history.append({"role": "user", "content": query})
-    log.info(f"history: {history}")
-
-    # Custom headers
-    custom_headers = {
-        "x-arch-deterministic-provider": "openai",
-    }
-
-    try:
-        raw_response = client.chat.completions.with_raw_response.create(
-            model="--",
-            messages=history,
-            temperature=1.0,
-            # metadata=metadata,
-            extra_headers=custom_headers,
-            stream=STREAM_RESPONSE,
-        )
-    except Exception as e:
-        log.info(e)
-        # remove last user message in case of exception
-        history.pop()
-        log.info("Error calling gateway API: {}".format(e))
-        raise gr.Error("Error calling gateway API: {}".format(e))
-
-    if STREAM_RESPONSE:
-        response = raw_response.parse()
-        history.append({"role": "assistant", "content": "", "model": ""})
-        # for gradio UI we don't want to show raw tool calls and messages from developer application
-        # so we're filtering those out
-        history_view = [h for h in history if h["role"] != "tool" and "content" in h]
-
-        messages = [
-            (history_view[i]["content"], history_view[i + 1]["content"])
-            for i in range(0, len(history_view) - 1, 2)
-        ]
-
-        for chunk in response:
-            if len(chunk.choices) > 0:
-                if chunk.choices[0].delta.role:
-                    if history[-1]["role"] != chunk.choices[0].delta.role:
-                        history.append(
-                            {
-                                "role": chunk.choices[0].delta.role,
-                                "content": chunk.choices[0].delta.content,
-                                "model": chunk.model,
-                                "tool_calls": chunk.choices[0].delta.tool_calls,
-                            }
-                        )
-
-                history[-1]["model"] = chunk.model
-                if chunk.choices[0].delta.content:
-                    if not history[-1]["content"]:
-                        history[-1]["content"] = ""
-                    history[-1]["content"] = (
-                        history[-1]["content"] + chunk.choices[0].delta.content
-                    )
-                if chunk.choices[0].delta.tool_calls:
-                    history[-1]["tool_calls"] = chunk.choices[0].delta.tool_calls
-
-                if chunk.model and chunk.choices[0].delta.content:
-                    messages[-1] = (
-                        messages[-1][0],
-                        messages[-1][1] + chunk.choices[0].delta.content,
-                    )
-                yield "", messages, state
-    else:
-        log.error(f"raw_response: {raw_response.text}")
-        response = raw_response.parse()
-
-        # extract arch_state from metadata and store it in gradio session state
-        # this state must be passed back to the gateway in the next request
-        response_json = json.loads(raw_response.text)
-        log.info(response_json)
-
-        arch_messages = get_arch_messages(response_json)
-        for arch_message in arch_messages:
-            history.append(arch_message)
-
-        content = response.choices[0].message.content
-
-        history.append(
-            {"role": "assistant", "content": content, "model": response.model}
-        )
-
-        # for gradio UI we don't want to show raw tool calls and messages from developer application
-        # so we're filtering those out
-        history_view = [h for h in history if h["role"] != "tool" and "content" in h]
-
-        messages = [
-            (history_view[i]["content"], history_view[i + 1]["content"])
-            for i in range(0, len(history_view) - 1, 2)
-        ]
-
-        yield "", messages, state
-
-
-def main():
-    with gr.Blocks(
-        theme=gr.themes.Default(
-            font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"]
-        ),
-        fill_height=True,
-        css=CSS_STYLE,
-    ) as demo:
-        with gr.Row(equal_height=True):
-            state = gr.State({})
-
-            with gr.Column(scale=4):
-                gr.JSON(
-                    value=get_prompt_targets(),
-                    open=True,
-                    show_indices=False,
-                    label="Available Tools",
-                    scale=1,
-                    min_height="95vh",
-                    elem_classes="json-container",
-                )
-            with gr.Column(scale=6):
-                chatbot = gr.Chatbot(
-                    label="Arch Chatbot",
-                    scale=1,
-                    elem_classes="chatbot",
-                )
-                textbox = gr.Textbox(
-                    show_label=False,
-                    placeholder="Enter text and press enter",
-                    scale=1,
-                    autofocus=True,
-                )
-
-            textbox.submit(chat, [textbox, chatbot, state], [textbox, chatbot, state])
-
-    demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/chatbot_ui/common.py b/chatbot_ui/common.py
new file mode 100644
index 00000000..3fd5c265
--- /dev/null
+++ b/chatbot_ui/common.py
@@ -0,0 +1,77 @@
+import json
+import logging
+import os
+import yaml
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+log = logging.getLogger(__name__)
+
+
+def process_stream_chunk(chunk, history):
+    delta = chunk.choices[0].delta
+    if delta.role and delta.role != history[-1]["role"]:
+        # create new history item if role changes
+        # this is likely due to arch tool call and api response
+        history.append({"role": delta.role})
+
+    history[-1]["model"] = chunk.model
+    # append tool calls to history if there are any in the chunk
+    if delta.tool_calls:
+        history[-1]["tool_calls"] = delta.tool_calls
+
+    if delta.content:
+        # append content to the last history item
+        history[-1]["content"] = history[-1].get("content", "") + delta.content
+        # yield content if it is from assistant
+        if history[-1]["role"] == "assistant":
+            return delta.content
+
+    return None
+
+
+def convert_prompt_target_to_openai_format(target):
+    tool = {
+        "description": target["description"],
+        "parameters": {"type": "object", "properties": {}, "required": []},
+    }
+
+    if "parameters" in target:
+        for param_info in target["parameters"]:
+            parameter = {
+                "type": param_info["type"],
+                "description": param_info["description"],
+            }
+
+            for key in ["default", "format", "enum", "items", "minimum", "maximum"]:
+                if key in param_info:
+                    parameter[key] = param_info[key]
+
+            tool["parameters"]["properties"][param_info["name"]] = parameter
+
+            required = param_info.get("required", False)
+            if required:
+                tool["parameters"]["required"].append(param_info["name"])
+
+    return {"name": target["name"], "info": tool}
+
+
+def get_prompt_targets():
+    try:
+        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
+            config = yaml.safe_load(file)
+
+            available_tools = []
+            for target in config["prompt_targets"]:
+                if not target.get("default", False):
+                    available_tools.append(
+                        convert_prompt_target_to_openai_format(target)
+                    )
+
+            return {tool["name"]: tool["info"] for tool in available_tools}
+    except Exception as e:
+        log.info(e)
+        return None
diff --git a/chatbot_ui/run_stream.py b/chatbot_ui/run_stream.py
new file mode 100644
index 00000000..bd4eab56
--- /dev/null
+++ b/chatbot_ui/run_stream.py
@@ -0,0 +1,120 @@
+import json
+import os
+import logging
+import yaml
+import gradio as gr
+
+from typing import List, Optional, Tuple
+from openai import OpenAI
+from dotenv import load_dotenv
+
+from common import get_prompt_targets, process_stream_chunk
+
+load_dotenv()
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+log = logging.getLogger(__name__)
+
+CHAT_COMPLETION_ENDPOINT = os.getenv("CHAT_COMPLETION_ENDPOINT")
+log.info(f"CHAT_COMPLETION_ENDPOINT: {CHAT_COMPLETION_ENDPOINT}")
+
+
+CSS_STYLE = """
+.json-container {
+    height: 95vh !important;
+    overflow-y: auto !important;
+}
+.chatbot {
+    height: calc(95vh - 100px) !important;
+    overflow-y: auto !important;
+}
+footer {visibility: hidden}
+"""
+
+client = OpenAI(
+    api_key="--",
+    base_url=CHAT_COMPLETION_ENDPOINT,
+)
+
+
+def chat(
+    query: Optional[str],
+    conversation: Optional[List[Tuple[str, str]]],
+    history: List[dict],
+):
+    history.append({"role": "user", "content": query})
+
+    try:
+        response = client.chat.completions.create(
+            # we select model from arch_config file
+            model="--",
+            messages=history,
+            temperature=1.0,
+            stream=True,
+        )
+    except Exception as e:
+        # remove last user message in case of exception
+        history.pop()
+        log.info("Error calling gateway API: {}".format(e))
+        raise gr.Error("Error calling gateway API: {}".format(e))
+
+    conversation.append((query, ""))
+
+    for chunk in response:
+        tokens = process_stream_chunk(chunk, history)
+        if tokens:
+            conversation[-1] = (
+                conversation[-1][0],
+                conversation[-1][1] + tokens,
+            )
+
+            yield "", conversation, history
+
+
+def main():
+    with gr.Blocks(
+        theme=gr.themes.Default(
+            font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "Arial", "sans-serif"]
+        ),
+        fill_height=True,
+        css=CSS_STYLE,
+    ) as demo:
+        with gr.Row(equal_height=True):
+            history = gr.State([])
+
+            with gr.Column(scale=1):
+                with gr.Accordion("See available tools", open=False):
+                    with gr.Column(scale=1):
+                        gr.JSON(
+                            value=get_prompt_targets(),
+                            show_indices=False,
+                            elem_classes="json-container",
+                            min_height="95vh",
+                        )
+
+            with gr.Column(scale=2):
+                chatbot = gr.Chatbot(
+                    label="Arch Chatbot",
+                    elem_classes="chatbot",
+                )
+                textbox = gr.Textbox(
+                    show_label=False,
+                    placeholder="Enter text and press enter",
+                    autofocus=True,
+                    elem_classes="textbox",
+                )
+
+            textbox.submit(
+                chat, [textbox, chatbot, history], [textbox, chatbot, history]
+            )
+
+    demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs
index 5d79d181..4bbd3fa6 100644
--- a/crates/prompt_gateway/src/stream_context.rs
+++ b/crates/prompt_gateway/src/stream_context.rs
@@ -900,7 +900,11 @@ impl StreamContext {
 
         // don't send tools message and api response to chat gpt
         for m in callout_context.request_body.messages.iter() {
-            if m.role == TOOL_ROLE || m.content.is_none() {
+            // don't send api response and tool calls to upstream LLMs
+            if m.role == TOOL_ROLE
+                || m.content.is_none()
+                || (m.tool_calls.is_some() && !m.tool_calls.as_ref().unwrap().is_empty())
+            {
                 continue;
             }
             messages.push(m.clone());
diff --git a/demos/function_calling/api_server/app/main.py b/demos/function_calling/api_server/app/main.py
index e87a3a21..a69c75d1 100644
--- a/demos/function_calling/api_server/app/main.py
+++ b/demos/function_calling/api_server/app/main.py
@@ -71,7 +71,7 @@ class DefaultTargetRequest(BaseModel):
 
 @app.post("/default_target")
 async def default_target(req: DefaultTargetRequest, res: Response):
-    logger.info(f"Received arch_messages: {req.messages}")
+    logger.info(f"Received messages: {req.messages}")
     resp = {
         "choices": [
             {
diff --git a/model_server/app/main.py b/model_server/app/main.py
index 93d6217b..a8d312d7 100644
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@@ -186,8 +186,8 @@ async def hallucination(req: HallucinationRequest, res: Response):
     start_time = time.perf_counter()
     classifier = zero_shot_model["pipeline"]
 
-    if "arch_messages" in req.parameters:
-        req.parameters.pop("arch_messages")
+    if "messages" in req.parameters:
+        req.parameters.pop("messages")
 
     candidate_labels = {f"{k} is {v}": k for k, v in req.parameters.items()}