Address review comments

2026-06-17 15:25:17 +02:00 · 2024-11-07 11:15:03 -08:00 · 2024-11-07 11:15:03 -08:00 · 1f383eafc4
commit 1f383eafc4
parent dd07ba2cd0
4 changed files with 27 additions and 23 deletions
--- a/model_server/app/commons/constants.py
+++ b/model_server/app/commons/constants.py
@@ -8,8 +8,9 @@ from app.prompt_guard.model_handler import ArchGuardHanlder
 logger = utils.get_model_server_logger()

 arch_function_hanlder = ArchFunctionHandler()
-prefill_list = ["May", "Could", "Sure", "Definitely", "Certainly", "Of course", "Can"]
-prefill_enabled = True
+PREFILL_LIST = ["May", "Could", "Sure", "Definitely", "Certainly", "Of course", "Can"]
+PREFILL_ENABLED = True
+TOOL_CALL_TOKEN = "<tool_call>"
 arch_function_endpoint = "https://api.fc.archgw.com/v1"
 arch_function_client = utils.get_client(arch_function_endpoint)
 arch_function_generation_params = {
--- a/model_server/app/function_calling/model_utils.py
+++ b/model_server/app/function_calling/model_utils.py
@@ -87,14 +87,14 @@ async def chat_completion(req: ChatMessage, res: Response):
        resp = const.arch_function_client.chat.completions.create(
            messages=messages,
            model=client_model_name,
-            stream=const.prefill_enabled,
+            stream=const.PREFILL_ENABLED,
            extra_body=const.arch_function_generation_params,
        )
    except Exception as e:
        logger.error(f"model_server <= arch_function: error: {e}")
        raise

-    if const.prefill_enabled:
+    if const.PREFILL_ENABLED:
        first_token_content = ""
        for token in resp:
            first_token_content = token.choices[
@@ -104,14 +104,16 @@ async def chat_completion(req: ChatMessage, res: Response):
                break

        # Check if the first token requires tool call handling
-        if first_token_content != "<tool_call>":
+        if first_token_content != const.TOOL_CALL_TOKEN:
            # Engage pre-filling response if no tool call is indicated
            resp.close()
            logger.info("Tool call is not found! Engage pre filling")
-            prefill_content = random.choice(const.prefill_list)
+            prefill_content = random.choice(const.PREFILL_LIST)
            messages.append({"role": "assistant", "content": prefill_content})

            # Send a new completion request with the updated messages
+            # With continue_final_message, the model continues the final message in the chat
+            # (the prefilled assistant turn) instead of starting a new one.
+            # Disable add_generation_prompt, which tells the chat template to add the tokens
+            # that mark the start of a bot response.
            extra_body = {
                **const.arch_function_generation_params,
                "continue_final_message": True,