Some fixes on model server (#362)

* Some fixes on model server

* Remove prompt_prefilling message

* Fix logging

* Fix poetry issues

* Improve logging and update the support for text truncation

* Fix tests

* Fix tests

* Fix tests

* Fix modelserver tests

* Update modelserver tests
This commit is contained in:
Shuguang Chen 2025-01-10 16:45:36 -08:00 committed by GitHub
parent ebda682b30
commit 88a02dc478
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 1090 additions and 1666 deletions

View file

@@ -1,12 +1,8 @@
import os
import pytest
from src.commons.globals import handler_map
from src.core.model_utils import ChatMessage, Message
import pytest
from fastapi.testclient import TestClient
from unittest.mock import AsyncMock, patch
from src.main import app
from src.commons.globals import handler_map
from src.core.utils.model_utils import ChatMessage, Message
# define function
get_weather_api = {
@@ -163,7 +159,10 @@ async def test_function_calling(get_data_func):
function_calling_response = await handler_map["Arch-Function"].chat_completion(
req
)
assert handler_map["Arch-Function"].hallu_handler.hallucination == hallucination
assert (
handler_map["Arch-Function"].hallucination_state.hallucination
== hallucination
)
response_txt = function_calling_response.choices[0].message.content
if parameter_gathering:

View file

@@ -1,33 +1,6 @@
from unittest.mock import patch, MagicMock
from src.core.guardrails import get_guardrail_handler
# Mock constants
# Device identifier -> Arch-Guard checkpoint served on that hardware.
# CPU uses a separate, lighter-weight build of the model.
arch_guard_model_type = dict(
    cpu="katanemo/Arch-Guard-cpu",
    cuda="katanemo/Arch-Guard",
    mps="katanemo/Arch-Guard",
)
# [TODO] Review: check the following code to test under `cpu`, `cuda`, and `mps`
# Test for `get_guardrail_handler()` function on `cpu`
@patch("src.core.guardrails.AutoTokenizer.from_pretrained")
@patch("src.core.guardrails.AutoModelForSequenceClassification.from_pretrained")
def test_guardrail_handler_on_cpu(mock_auto_model, mock_tokenizer):
device = "cpu"
mock_tokenizer.return_value = MagicMock()
guardrail = get_guardrail_handler(device=device)
mock_tokenizer.assert_called_once_with(guardrail.model_name, trust_remote_code=True)
mock_auto_model.assert_called_once_with(
guardrail.model_name,
device_map=device,
low_cpu_mem_usage=True,
)
# Test for `get_guardrail_handler()` function on `cuda`
@patch("src.core.guardrails.AutoTokenizer.from_pretrained")