add e2e test

2026-06-17 15:25:17 +02:00 · 2024-11-05 08:42:57 -08:00 · 2024-11-05 08:42:57 -08:00 · e74a3e1e38
commit e74a3e1e38
parent 0d9cbdebda
5 changed files with 61 additions and 4 deletions
--- a/model_server/app/function_calling/model_utils.py
+++ b/model_server/app/function_calling/model_utils.py
@@ -133,6 +133,7 @@ async def chat_completion(
                if hasattr(token.choices[0].delta, "content"):
                    full_response += token.choices[0].delta.content
    else:
+        logger.info("Stream is disabled, not engaging pre-filling")
        full_response = resp.choices[0].message.content

    tool_calls = const.arch_function_hanlder.extract_tool_calls(full_response)
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@@ -6,7 +6,7 @@ import app.prompt_guard.model_utils as guard_utils

 from typing import List, Dict
 from pydantic import BaseModel
-from fastapi import FastAPI, Response, HTTPException
+from fastapi import FastAPI, Response, HTTPException, Request
 from app.function_calling.model_utils import ChatMessage

 from app.commons.constants import embedding_model, zero_shot_model, arch_guard_handler
@@ -214,9 +214,12 @@ async def hallucination(req: HallucinationRequest, res: Response):


@app.post("/v1/chat/completions")
-async def chat_completion(req: ChatMessage, res: Response):
+async def chat_completion(req: ChatMessage, res: Response, request: Request):
    try:
-        result = await arch_function_chat_completion(req, res)
+        prefill_enabled = (
+            request.query_params.get("prefill_enabled", "true").lower() == "true"
+        )
+        result = await arch_function_chat_completion(req, res, prefill_enabled)
        return result
    except Exception as e:
        logger.error(f"Error in chat_completion: {e}")