update fix: replace per-request prefill_enabled query parameter with a module-level constant

2026-06-20 15:28:07 +02:00 · 2024-11-06 16:16:08 -08:00 · 2024-11-06 16:16:08 -08:00 · 68c2243e83
commit 68c2243e83
parent d9c64738c7
6 changed files with 43 additions and 33 deletions
--- a/model_server/app/commons/constants.py
+++ b/model_server/app/commons/constants.py
@@ -9,6 +9,7 @@ logger = utils.get_model_server_logger()

 arch_function_hanlder = ArchFunctionHandler()
 prefill_list = ["May", "Could", "Sure", "Definitely", "Certainly", "Of course", "Can"]
+prefill_enabled = True
 arch_function_endpoint = "https://api.fc.archgw.com/v1"
 arch_function_client = utils.get_client(arch_function_endpoint)
 arch_function_generation_params = {
--- a/model_server/app/function_calling/model_utils.py
+++ b/model_server/app/function_calling/model_utils.py
@@ -64,9 +64,7 @@ def process_messages(history: list[Message]):
    return updated_history


-async def chat_completion(
-    req: ChatMessage, res: Response, prefill_enabled: bool = True
-):
+async def chat_completion(req: ChatMessage, res: Response):
    logger.info("starting request")

    tools_encoded = const.arch_function_hanlder._format_system(req.tools)
@@ -89,14 +87,14 @@ async def chat_completion(
        resp = const.arch_function_client.chat.completions.create(
            messages=messages,
            model=client_model_name,
-            stream=prefill_enabled,
+            stream=const.prefill_enabled,
            extra_body=const.arch_function_generation_params,
        )
    except Exception as e:
        logger.error(f"model_server <= arch_function: error: {e}")
        raise

-    if prefill_enabled:
+    if const.prefill_enabled:
        first_token_content = ""
        for token in resp:
            first_token_content = token.choices[
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@@ -216,10 +216,7 @@ async def hallucination(req: HallucinationRequest, res: Response):
@app.post("/v1/chat/completions")
 async def chat_completion(req: ChatMessage, res: Response, request: Request):
    try:
-        prefill_enabled = (
-            request.query_params.get("prefill_enabled", "true").lower() == "true"
-        )
-        result = await arch_function_chat_completion(req, res, prefill_enabled)
+        result = await arch_function_chat_completion(req, res)
        return result
    except Exception as e:
        logger.error(f"Error in chat_completion: {e}")
--- a/model_server/app/tests/test_function_calling.py
+++ b/model_server/app/tests/test_function_calling.py
@@ -73,7 +73,7 @@ async def test_chat_completion(mock_hanlder, mock_client):
    mock_hanlder._format_system.return_value = "<formatted_tools>"

    response = Response()
-    chat_response = await chat_completion(request, response, prefill_enabled=True)
+    chat_response = await chat_completion(request, response)

    assert isinstance(chat_response, ChatCompletionResponse)
    assert chat_response.choices[0].message.content is not None