add prefill and test (#236)

* add prefill and test

* fix stream

* fix

* feedback

* address comments

* update

* add e2e test

* fix e2e test

* update fix

* fix

* address cmt

* address cmt
This commit is contained in:
CTran 2024-11-07 11:59:29 -08:00 committed by GitHub
parent f48489f7c0
commit fb67788be0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 216 additions and 19 deletions

15
e2e_tests/.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,15 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            // Launches the file open in the editor under the debugpy debugger.
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        }
    ]
}

View file

@ -10,6 +10,16 @@ LLM_GATEWAY_ENDPOINT = os.getenv(
)
ARCH_STATE_HEADER = "x-arch-state"
# Leading words an Arch direct (non-tool) response is expected to begin with
# when the model server engages pre-filling.
PREFILL_LIST = ["May", "Could", "Sure", "Definitely", "Certainly", "Of course", "Can"]
def get_data_chunks(stream, n=1):
chunks = []

View file

@ -3,7 +3,12 @@ import pytest
import requests
from deepdiff import DeepDiff
from common import PROMPT_GATEWAY_ENDPOINT, get_arch_messages, get_data_chunks
from common import (
PROMPT_GATEWAY_ENDPOINT,
PREFILL_LIST,
get_arch_messages,
get_data_chunks,
)
@pytest.mark.parametrize("stream", [True, False])
@ -101,13 +106,21 @@ def test_prompt_gateway_arch_direct_response(stream):
assert len(choices) > 0
tool_calls = choices[0].get("delta", {}).get("tool_calls", [])
assert len(tool_calls) == 0
response_json = json.loads(chunks[1])
choices = response_json.get("choices", [])
assert len(choices) > 0
message = choices[0]["delta"]["content"]
else:
response_json = response.json()
assert response_json.get("model").startswith("Arch")
choices = response_json.get("choices", [])
assert len(choices) > 0
message = choices[0]["message"]["content"]
assert "Could you provide the following details days" not in message
assert any(
message.startswith(word) for word in PREFILL_LIST
), f"Expected assistant message to start with one of {PREFILL_LIST}, but got '{assistant_message}'"
@pytest.mark.parametrize("stream", [True, False])

View file

@ -8,6 +8,9 @@ from app.prompt_guard.model_handler import ArchGuardHanlder
logger = utils.get_model_server_logger()
arch_function_hanlder = ArchFunctionHandler()
# Words used to seed (pre-fill) the assistant reply when no tool call is
# detected in the model's first token.
PREFILL_LIST = [
    "May",
    "Could",
    "Sure",
    "Definitely",
    "Certainly",
    "Of course",
    "Can",
]

# Feature flag: stream the first token so pre-filling can be engaged.
PREFILL_ENABLED = True

# Sentinel the function-calling model emits to open a tool call.
TOOL_CALL_TOKEN = "<tool_call>"

# Hosted Arch-Function inference endpoint.
arch_function_endpoint = "https://api.fc.archgw.com/v1"
arch_function_client = utils.get_client(arch_function_endpoint)
arch_function_generation_params = {

View file

@ -1,21 +1,21 @@
import json
import hashlib
import app.commons.constants as const
import random
from fastapi import Response
from pydantic import BaseModel
from app.commons.utilities import get_model_server_logger
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
logger = get_model_server_logger()
class Message(BaseModel):
    """A single chat message exchanged with the model.

    Every field is optional (with a benign default) so partial payloads —
    tool responses, streamed deltas — validate without error.

    Note: the diff artifact of keeping both the old required fields and the
    new Optional ones is removed; only the final Optional declarations remain.
    """

    # Speaker role, e.g. "user" / "assistant" / "tool".
    role: Optional[str] = ""
    # Text content; empty for pure tool-call messages.
    content: Optional[str] = ""
    # Tool invocations attached to an assistant message (pydantic copies
    # mutable defaults per-instance, so the [] default is safe here).
    tool_calls: Optional[List[Dict[str, Any]]] = []
    # Correlates a tool-role message with the call it answers.
    tool_call_id: Optional[str] = ""
class ChatMessage(BaseModel):
@ -23,6 +23,20 @@ class ChatMessage(BaseModel):
tools: List[Dict[str, Any]]
class Choice(BaseModel):
    """One generated completion choice, OpenAI-response style."""

    # The assistant message produced for this choice.
    message: Message
    # Why generation stopped; defaults to "stop" since responses built here
    # are returned whole, not streamed.
    finish_reason: Optional[str] = "stop"
    # Position of this choice within the response's choices list.
    index: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
    """OpenAI-style chat completion response envelope."""

    # Generated choices; the visible builder in this file creates exactly one.
    choices: List[Choice]
    # Model label reported to clients.
    model: Optional[str] = "Arch-Function"
    # NOTE(review): OpenAI's spec uses an integer Unix timestamp for `created`
    # and the literal "chat.completion" for `object`; this model uses str/""
    # and "chat_completion" — confirm no client expects the OpenAI spelling.
    created: Optional[str] = ""
    id: Optional[str] = ""
    object: Optional[str] = "chat_completion"
def process_messages(history: list[Message]):
updated_history = []
for hist in history:
@ -67,30 +81,77 @@ async def chat_completion(req: ChatMessage, res: Response):
f"model_server => arch_function: {client_model_name}, messages: {json.dumps(messages)}"
)
# Retrieve the first token, handling the Stream object carefully
try:
resp = const.arch_function_client.chat.completions.create(
messages=messages,
model=client_model_name,
stream=False,
stream=const.PREFILL_ENABLED,
extra_body=const.arch_function_generation_params,
)
except Exception as e:
logger.error(f"model_server <= arch_function: error: {e}")
raise
tool_calls = const.arch_function_hanlder.extract_tool_calls(
resp.choices[0].message.content
)
if const.PREFILL_ENABLED:
first_token_content = ""
for token in resp:
first_token_content = token.choices[
0
].delta.content.strip() # Clean up the content
if first_token_content: # Break if it's non-empty
break
# Check if the first token requires tool call handling
if first_token_content != const.TOOL_CALL_TOKEN:
# Engage pre-filling response if no tool call is indicated
resp.close()
logger.info("Tool call is not found! Engage pre filling")
prefill_content = random.choice(const.PREFILL_LIST)
messages.append({"role": "assistant", "content": prefill_content})
# Send a new completion request with the updated messages
# the model will continue the final message in the chat instead of starting a new one
# disable add_generation_prompt which tells the template to add tokens that indicate the start of a bot response.
extra_body = {
**const.arch_function_generation_params,
"continue_final_message": True,
"add_generation_prompt": False,
}
pre_fill_resp = const.arch_function_client.chat.completions.create(
messages=messages,
model=client_model_name,
stream=False,
extra_body=extra_body,
)
full_response = pre_fill_resp.choices[0].message.content
else:
# Initialize full response and iterate over tokens to gather the full response
full_response = first_token_content
for token in resp:
if hasattr(token.choices[0].delta, "content"):
full_response += token.choices[0].delta.content
else:
logger.info("Stream is disabled, not engaging pre-filling")
full_response = resp.choices[0].message.content
tool_calls = const.arch_function_hanlder.extract_tool_calls(full_response)
if tool_calls:
resp.choices[0].message.tool_calls = tool_calls
resp.choices[0].message.content = None
message = Message(content="", tool_calls=tool_calls)
else:
message = Message(content=full_response, tool_calls=[])
choice = Choice(message=message)
chat_completion_response = ChatCompletionResponse(
choices=[choice], model=client_model_name
)
logger.info(
f"model_server <= arch_function: (tools): {json.dumps([tool_call['function'] for tool_call in tool_calls])}"
)
logger.info(
f"model_server <= arch_function: response body: {json.dumps(resp.to_dict())}"
f"model_server <= arch_function: response body: {json.dumps(chat_completion_response.dict())}"
)
return resp
return chat_completion_response

View file

@ -6,7 +6,7 @@ import app.prompt_guard.model_utils as guard_utils
from typing import List, Dict
from pydantic import BaseModel
from fastapi import FastAPI, Response, HTTPException
from fastapi import FastAPI, Response, HTTPException, Request
from app.function_calling.model_utils import ChatMessage
from app.commons.constants import embedding_model, zero_shot_model, arch_guard_handler
@ -214,6 +214,11 @@ async def hallucination(req: HallucinationRequest, res: Response):
@app.post("/v1/chat/completions")
async def chat_completion(req: ChatMessage, res: Response, request: Request):
    """Proxy a chat completion request to the Arch-Function handler.

    The rendered diff left both the old and new definitions in place; only
    the new try/except version is kept.  `request` is unused here but is
    part of the FastAPI-injected signature, so it stays.

    Returns the handler's response object, or a generic JSON error body with
    a 500 status if the handler raises.
    """
    try:
        return await arch_function_chat_completion(req, res)
    except Exception as e:
        # Boundary handler: logger.exception records the full traceback
        # (logger.error would drop it), then a generic 500 is surfaced.
        logger.exception(f"Error in chat_completion: {e}")
        res.status_code = 500
        return {"error": "Internal server error"}

View file

@ -0,0 +1,90 @@
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
import app.commons.constants as const
from fastapi import Response
from app.function_calling.model_utils import (
process_messages,
chat_completion,
Message,
ChatMessage,
Choice,
ChatCompletionResponse,
)
def sample_messages():
    """Build a representative three-turn conversation: a user greeting, an
    assistant tool call, and the tool's reply — all fields explicitly set."""
    user_turn = Message(role="user", content="Hello!", tool_calls=[], tool_call_id="")
    assistant_turn = Message(
        role="assistant",
        content="",
        tool_calls=[{"function": {"name": "sample_tool"}}],
        tool_call_id="sample_id",
    )
    tool_turn = Message(
        role="tool", content="Response from tool", tool_calls=[], tool_call_id=""
    )
    return [user_turn, assistant_turn, tool_turn]
def sample_request(sample_messages):
    """Wrap *sample_messages* in a ChatMessage carrying one sample tool."""
    sample_tools = [{"name": "sample_tool", "description": "A sample tool"}]
    return ChatMessage(messages=sample_messages, tools=sample_tools)
@patch("app.commons.constants.arch_function_hanlder")
def test_process_messages(mock_handler):
    """process_messages maps roles and wraps tool traffic in sentinel tags."""
    processed = process_messages(sample_messages())

    # List equality implies both the length-3 check and the per-element checks.
    expected = [
        {"role": "user", "content": "Hello!"},
        {
            "role": "assistant",
            "content": '<tool_call>\n{"name": "sample_tool"}\n</tool_call>',
        },
        {
            "role": "user",
            "content": "<tool_response>\nResponse from tool\n</tool_response>",
        },
    ]
    assert processed == expected
@patch("app.commons.constants.arch_function_client")
@patch("app.commons.constants.arch_function_hanlder")
@pytest.mark.asyncio
async def test_chat_completion(mock_hanlder, mock_client):
    """Unit test of chat_completion with the model client fully mocked.

    Verifies that the first create() call streams (to peek at the first
    token) and that, since no tool-call token appears, a second
    non-streaming call is made whose final message is a prefill word.

    Fix: `== True` / `== False` comparisons replaced with `is True` /
    `is False` (PEP 8 / flake8 E712).
    """
    # Mock the model list return for client
    mock_client.models.list.return_value = MagicMock(
        data=[MagicMock(id="sample_model")]
    )

    request = sample_request(sample_messages())

    # Simulate stream response as a two-token iterator.
    # NOTE(review): AsyncMock.__aiter__ builds an *async* iterator, but the
    # handler iterates the stream synchronously (`for token in resp`) —
    # confirm this mock actually drives the streaming branch.
    mock_response = AsyncMock()
    mock_response.__aiter__.return_value = [
        MagicMock(choices=[MagicMock(delta=MagicMock(content="Hi there!"))]),
        MagicMock(choices=[MagicMock(delta=MagicMock(content=""))]),  # end of stream
    ]
    mock_client.chat.completions.create.return_value = mock_response

    # Mock the tool formatter
    mock_hanlder._format_system.return_value = "<formatted_tools>"

    response = Response()
    chat_response = await chat_completion(request, response)

    assert isinstance(chat_response, ChatCompletionResponse)
    assert chat_response.choices[0].message.content is not None

    # First call: streaming enabled so the first token can be inspected.
    first_call_args = mock_client.chat.completions.create.call_args_list[0][1]
    assert first_call_args["stream"] is True
    assert "model" in first_call_args
    assert first_call_args["messages"][0]["content"] == "<formatted_tools>"

    # Second call: non-streaming continuation seeded with the prefill word.
    second_call_args = mock_client.chat.completions.create.call_args_list[1][1]
    assert second_call_args["stream"] is False
    assert "model" in second_call_args
    assert second_call_args["messages"][-1]["content"] in const.PREFILL_LIST