Some fixes on model server (#362)

* Some fixes on model server

* Remove prompt_prefilling message

* Fix logging

* Fix poetry issues

* Improve logging and update the support for text truncation

* Fix tests

* Fix tests

* Fix tests

* Fix modelserver tests

* Update modelserver tests
This commit is contained in:
Shuguang Chen 2025-01-10 16:45:36 -08:00 committed by GitHub
parent ebda682b30
commit 88a02dc478
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 1090 additions and 1666 deletions

View file

@@ -1,12 +1,8 @@
import os
import pytest
from src.commons.globals import handler_map
from src.core.model_utils import ChatMessage, Message
import pytest
from fastapi.testclient import TestClient
from unittest.mock import AsyncMock, patch
from src.main import app
from src.commons.globals import handler_map
from src.core.utils.model_utils import ChatMessage, Message
# define function
get_weather_api = {
@@ -163,7 +159,10 @@ async def test_function_calling(get_data_func):
function_calling_response = await handler_map["Arch-Function"].chat_completion(
req
)
assert handler_map["Arch-Function"].hallu_handler.hallucination == hallucination
assert (
handler_map["Arch-Function"].hallucination_state.hallucination
== hallucination
)
response_txt = function_calling_response.choices[0].message.content
if parameter_gathering:

View file

@@ -1,33 +1,6 @@
from unittest.mock import patch, MagicMock
from src.core.guardrails import get_guardrail_handler
# Mock constants
# Device identifier -> Arch-Guard checkpoint served on that hardware.
# CPU uses a separate, lighter-weight build of the model.
arch_guard_model_type = dict(
    cpu="katanemo/Arch-Guard-cpu",
    cuda="katanemo/Arch-Guard",
    mps="katanemo/Arch-Guard",
)
# [TODO] Review: check the following code to test under `cpu`, `cuda`, and `mps`
# Test for `get_guardrail_handler()` function on `cpu`
@patch("src.core.guardrails.AutoTokenizer.from_pretrained")
@patch("src.core.guardrails.AutoModelForSequenceClassification.from_pretrained")
def test_guardrail_handler_on_cpu(mock_auto_model, mock_tokenizer):
device = "cpu"
mock_tokenizer.return_value = MagicMock()
guardrail = get_guardrail_handler(device=device)
mock_tokenizer.assert_called_once_with(guardrail.model_name, trust_remote_code=True)
mock_auto_model.assert_called_once_with(
guardrail.model_name,
device_map=device,
low_cpu_mem_usage=True,
)
# Test for `get_guardrail_handler()` function on `cuda`
@patch("src.core.guardrails.AutoTokenizer.from_pretrained")