diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 133f9ff8..73c2bde4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,6 +25,11 @@ repos:
       # --lib is to only test the library, since when integration tests are made,
       # they will be in a separate tests directory
      entry: bash -c "cd arch && cargo test -p intelligent-prompt-gateway --lib"
+    - id: python-tests
+      name: Run Python Tests with pytest
+      language: system
+      entry: bash -c "cd model_server && pytest --maxfail=5 --disable-warnings"
+      types: [python]
   - repo: https://github.com/psf/black
     rev: 23.1.0
     hooks:
diff --git a/model_server/app/commons/constants.py b/model_server/app/commons/constants.py
index f5747213..67970bf9 100644
--- a/model_server/app/commons/constants.py
+++ b/model_server/app/commons/constants.py
@@ -28,6 +28,6 @@ arch_guard_model_type = {
 
 embedding_model = loader.get_embedding_model()
 zero_shot_model = loader.get_zero_shot_model()
-prompt_guard_dict = loader.get_prompt_guard(arch_guard_model_type[glb.device])
+prompt_guard_dict = loader.get_prompt_guard(arch_guard_model_type[glb.DEVICE])
 
 arch_guard_handler = ArchGuardHanlder(model_dict=prompt_guard_dict)
diff --git a/model_server/app/commons/globals.py b/model_server/app/commons/globals.py
index 18c9215c..6c82ede2 100644
--- a/model_server/app/commons/globals.py
+++ b/model_server/app/commons/globals.py
@@ -2,4 +2,3 @@
 import app.commons.utilities as utils
 
 DEVICE = utils.get_device()
-MODE = utils.get_serving_mode()
diff --git a/model_server/app/loader.py b/model_server/app/loader.py
index 1baa623f..32996f70 100644
--- a/model_server/app/loader.py
+++ b/model_server/app/loader.py
@@ -7,6 +7,10 @@ from optimum.onnxruntime import (
     ORTModelForSequenceClassification,
 )
 import app.commons.utilities as utils
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from optimum.intel import OVModelForSequenceClassification
+
 
 logger = utils.get_model_server_logger()
 
@@ -64,12 +68,9 @@ def get_prompt_guard(model_name):
     logger.info("Loading Guard Model...")
 
     if glb.DEVICE == "cpu":
-        from optimum.intel import OVModelForSequenceClassification
 
         model_class = OVModelForSequenceClassification
-    elif glb.DEVICE == "gpu":
-        import torch
-        from transformers import AutoModelForSequenceClassification
+    else:
 
         model_class = AutoModelForSequenceClassification
 
diff --git a/model_server/app/prompt_guard/model_handler.py b/model_server/app/prompt_guard/model_handler.py
index cdd15827..a200679b 100644
--- a/model_server/app/prompt_guard/model_handler.py
+++ b/model_server/app/prompt_guard/model_handler.py
@@ -11,7 +11,6 @@ class ArchGuardHanlder:
         self.model = model_dict["model"]
         self.tokenizer = model_dict["tokenizer"]
         self.device = model_dict["device"]
-        self.hardware_config = model_dict["hardware_config"]
 
         self.threshold = threshold
 
diff --git a/model_server/app/tests/test_app.py b/model_server/app/tests/test_app.py
new file mode 100644
index 00000000..d20172a6
--- /dev/null
+++ b/model_server/app/tests/test_app.py
@@ -0,0 +1,97 @@
+import pytest
+import httpx
+from fastapi.testclient import TestClient
+from app.main import app  # Assuming your FastAPI app is in main.py
+
+client = TestClient(app)
+
+# Unit test for the health check endpoint
+@pytest.mark.asyncio
+async def test_healthz():
+    response = client.get("/healthz")
+    assert response.status_code == 200
+    assert response.json() == {"status": "ok"}
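+
+# Note: TestClient exercises the app in-process with plain synchronous calls,
+# so @pytest.mark.asyncio (which relies on pytest-asyncio being installed) is
+# only strictly needed for tests that await, such as test_chat_completion
+# below. On httpx >= 0.28 the app= shortcut used there is removed in favor of
+# transport=httpx.ASGITransport(app=app).
+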
client.get("/models") + assert response.status_code == 200 + assert response.json()["object"] == "list" + assert len(response.json()["data"]) > 0 + +# Unit test for embeddings endpoint +@pytest.mark.asyncio +async def test_embedding(): + request_data = { + "input": "Test embedding", + "model": "katanemo/bge-large-en-v1.5" + } + response = client.post("/embeddings", json=request_data) + if request_data["model"] == "katanemo/bge-large-en-v1.5": + assert response.status_code == 200 + assert response.json()["object"] == "list" + assert "data" in response.json() + else: + assert response.status_code == 400 + +# Unit test for the guard endpoint +@pytest.mark.asyncio +async def test_guard(): + request_data = { + "input": "Test for jailbreak and toxicity", + "task": "jailbreak" + } + response = client.post("/guard", json=request_data) + assert response.status_code == 200 + assert "jailbreak_verdict" in response.json() + +# Unit test for the zero-shot endpoint +@pytest.mark.asyncio +async def test_zeroshot(): + request_data = { + "input": "Test input", + "labels": ["label1", "label2"], + "model": "katanemo/bart-large-mnli" + } + response = client.post("/zeroshot", json=request_data) + if request_data["model"] == "katanemo/bart-large-mnli": + assert response.status_code == 200 + assert "predicted_class" in response.json() + else: + assert response.status_code == 400 + +# Unit test for the hallucination endpoint +@pytest.mark.asyncio +async def test_hallucination(): + request_data = { + "prompt": "Test hallucination", + "parameters": {"param1": "value1"}, + "model": "katanemo/bart-large-mnli" + } + response = client.post("/hallucination", json=request_data) + if request_data["model"] == "katanemo/bart-large-mnli": + assert response.status_code == 200 + assert "params_scores" in response.json() + else: + assert response.status_code == 400 + +# Unit test for the chat completion endpoint +@pytest.mark.asyncio +async def test_chat_completion(): + async with httpx.AsyncClient(app=app, base_url="http://test") as client: + request_data = { + "messages": [{"role": "user", "content": "Hello!"}], + "model": "Arch-Function-1.5B", + "tools": [], # Assuming tools is part of the req as per the function + "metadata": {"x-arch-state": "[]"} # Assuming metadata is needed + } + response = await client.post("/v1/chat/completions", json=request_data) + assert response.status_code == 200 + assert "choices" in response.json() diff --git a/model_server/app/tests/test_loaders_cpu.py b/model_server/app/tests/test_loaders_cpu.py new file mode 100644 index 00000000..812979b8 --- /dev/null +++ b/model_server/app/tests/test_loaders_cpu.py @@ -0,0 +1,83 @@ +import os +import pytest +from unittest.mock import patch, MagicMock +import app.commons.globals as glb +from app.loader import get_embedding_model, get_zero_shot_model, get_prompt_guard + +# Mock constants +glb.DEVICE = "cpu" # Adjust as needed for your test case +arch_guard_model_type = { + "cpu": "katanemo/Arch-Guard-cpu", + "cuda": "katanemo/Arch-Guard", + "mps": "katanemo/Arch-Guard", +} +@pytest.fixture +def mock_env(): + # Mock environment variables + os.environ["MODELS"] = "katanemo/bge-large-en-v1.5" + os.environ["ZERO_SHOT_MODELS"] = "katanemo/bart-large-mnli" + +# Test for get_embedding_model function +@patch("app.loader.ORTModelForFeatureExtraction.from_pretrained") +@patch("app.loader.AutoModel.from_pretrained") +@patch("app.loader.AutoTokenizer.from_pretrained") +def test_get_embedding_model(mock_tokenizer, mock_automodel, mock_ort_model, mock_env): + 
+# Test for get_embedding_model function
+@patch("app.loader.ORTModelForFeatureExtraction.from_pretrained")
+@patch("app.loader.AutoModel.from_pretrained")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_embedding_model(mock_tokenizer, mock_automodel, mock_ort_model, mock_env):
+    mock_automodel.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    embedding_model = get_embedding_model()
+
+    # Assertions
+    assert embedding_model["model_name"] == "katanemo/bge-large-en-v1.5"
+    mock_tokenizer.assert_called_once_with("katanemo/bge-large-en-v1.5", trust_remote_code=True)
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bge-large-en-v1.5", file_name="onnx/model.onnx")
+    else:
+        mock_automodel.assert_called_once_with("katanemo/bge-large-en-v1.5", device_map=glb.DEVICE)
+
+# Test for get_zero_shot_model function
+@patch("app.loader.ORTModelForSequenceClassification.from_pretrained")
+@patch("app.loader.pipeline")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_zero_shot_model(mock_tokenizer, mock_pipeline, mock_ort_model, mock_env):
+    mock_pipeline.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    zero_shot_model = get_zero_shot_model()
+
+    # Assertions
+    assert zero_shot_model["model_name"] == "katanemo/bart-large-mnli"
+    mock_tokenizer.assert_called_once_with("katanemo/bart-large-mnli")
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bart-large-mnli", file_name="onnx/model.onnx")
+    else:
+        mock_pipeline.assert_called_once()
+
+# Test for get_prompt_guard function
+@patch("app.loader.AutoTokenizer.from_pretrained")
+@patch("app.loader.OVModelForSequenceClassification.from_pretrained")
+@patch("app.loader.AutoModelForSequenceClassification.from_pretrained")
+def test_get_prompt_guard(mock_auto_model, mock_ov_model, mock_tokenizer):
+    # Mock model based on device
+    if glb.DEVICE == "cpu":
+        mock_ov_model.return_value = MagicMock()
+    else:
+        mock_auto_model.return_value = MagicMock()
+
+    mock_tokenizer.return_value = MagicMock()
+
+    prompt_guard = get_prompt_guard(arch_guard_model_type[glb.DEVICE])
+
+    # Assertions
+    assert prompt_guard["model_name"] == arch_guard_model_type[glb.DEVICE]
+    mock_tokenizer.assert_called_once_with(arch_guard_model_type[glb.DEVICE], trust_remote_code=True)
+    if glb.DEVICE == "cpu":
+        mock_ov_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )
+    else:
+        mock_auto_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )
diff --git a/model_server/app/tests/test_loaders_gpu.py b/model_server/app/tests/test_loaders_gpu.py
new file mode 100644
index 00000000..b9f0edbb
--- /dev/null
+++ b/model_server/app/tests/test_loaders_gpu.py
@@ -0,0 +1,88 @@
+import os
+import pytest
+from unittest.mock import patch, MagicMock
+import app.commons.globals as glb
+from app.loader import get_embedding_model, get_zero_shot_model, get_prompt_guard
+
+# Mock constants
+glb.DEVICE = "cuda"  # Force the CUDA code path for this test module
+arch_guard_model_type = {
+    "cpu": "katanemo/Arch-Guard-cpu",
+    "cuda": "katanemo/Arch-Guard",
+    "mps": "katanemo/Arch-Guard",
+}
+@pytest.fixture
+def mock_env():
+    # Mock environment variables
+    os.environ["MODELS"] = "katanemo/bge-large-en-v1.5"
+    os.environ["ZERO_SHOT_MODELS"] = "katanemo/bart-large-mnli"
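+
+# Caveat: this module-level override runs at import (collection) time, so if
+# the cpu/gpu/mps variants are collected in one pytest session the last import
+# wins; running each file in its own process (e.g. pytest-forked) or using a
+# monkeypatch-based fixture would isolate the device setting.
+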
+# Test for get_embedding_model function
+@patch("app.loader.ORTModelForFeatureExtraction.from_pretrained")
+@patch("app.loader.AutoModel.from_pretrained")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_embedding_model(mock_tokenizer, mock_automodel, mock_ort_model, mock_env):
+    mock_automodel.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    embedding_model = get_embedding_model()
+
+    # Assertions
+    assert embedding_model["model_name"] == "katanemo/bge-large-en-v1.5"
+    mock_tokenizer.assert_called_once_with("katanemo/bge-large-en-v1.5", trust_remote_code=True)
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bge-large-en-v1.5", file_name="onnx/model.onnx")
+    else:
+        mock_automodel.assert_called_once_with("katanemo/bge-large-en-v1.5", device_map=glb.DEVICE)
+
+# Test for get_zero_shot_model function
+@patch("app.loader.ORTModelForSequenceClassification.from_pretrained")
+@patch("app.loader.pipeline")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_zero_shot_model(mock_tokenizer, mock_pipeline, mock_ort_model, mock_env):
+    mock_pipeline.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    zero_shot_model = get_zero_shot_model()
+
+    # Assertions
+    assert zero_shot_model["model_name"] == "katanemo/bart-large-mnli"
+    mock_tokenizer.assert_called_once_with("katanemo/bart-large-mnli")
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bart-large-mnli", file_name="onnx/model.onnx")
+    else:
+        mock_pipeline.assert_called_once()
+
+# Test for get_prompt_guard function
+@patch("app.loader.AutoTokenizer.from_pretrained")
+@patch("app.loader.OVModelForSequenceClassification.from_pretrained")
+@patch("app.loader.AutoModelForSequenceClassification.from_pretrained")
+def test_get_prompt_guard(mock_auto_model, mock_ov_model, mock_tokenizer):
+    # Mock model based on device
+    if glb.DEVICE == "cpu":
+        mock_ov_model.return_value = MagicMock()
+    else:
+        mock_auto_model.return_value = MagicMock()
+
+    mock_tokenizer.return_value = MagicMock()
+
+    prompt_guard = get_prompt_guard(arch_guard_model_type[glb.DEVICE])
+
+    # Assertions
+    assert prompt_guard["model_name"] == arch_guard_model_type[glb.DEVICE]
+    mock_tokenizer.assert_called_once_with(arch_guard_model_type[glb.DEVICE], trust_remote_code=True)
+    if glb.DEVICE == "cpu":
+        mock_ov_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )
+    else:
+        mock_auto_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )
diff --git a/model_server/app/tests/test_loaders_mps.py b/model_server/app/tests/test_loaders_mps.py
new file mode 100644
index 00000000..d0e2db12
--- /dev/null
+++ b/model_server/app/tests/test_loaders_mps.py
@@ -0,0 +1,87 @@
+import os
+import pytest
+from unittest.mock import patch, MagicMock
+import app.commons.globals as glb
+from app.loader import get_embedding_model, get_zero_shot_model, get_prompt_guard
+
+# Mock constants
+glb.DEVICE = "mps"  # Force the MPS code path for this test module
+arch_guard_model_type = {
+    "cpu": "katanemo/Arch-Guard-cpu",
+    "cuda": "katanemo/Arch-Guard",
+    "mps": "katanemo/Arch-Guard",
+}
+@pytest.fixture
+def mock_env():
+    # Mock environment variables
+    os.environ["MODELS"] = "katanemo/bge-large-en-v1.5"
+    os.environ["ZERO_SHOT_MODELS"] = "katanemo/bart-large-mnli"
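+
+# Per loader.py, only DEVICE == "cpu" selects the OpenVINO class; "mps" falls
+# into the else branch, so the guard-model test below exercises the same
+# AutoModelForSequenceClassification path as the cuda variant.
+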
+# Test for get_embedding_model function
+@patch("app.loader.ORTModelForFeatureExtraction.from_pretrained")
+@patch("app.loader.AutoModel.from_pretrained")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_embedding_model(mock_tokenizer, mock_automodel, mock_ort_model, mock_env):
+    mock_automodel.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    embedding_model = get_embedding_model()
+
+    # Assertions
+    assert embedding_model["model_name"] == "katanemo/bge-large-en-v1.5"
+    mock_tokenizer.assert_called_once_with("katanemo/bge-large-en-v1.5", trust_remote_code=True)
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bge-large-en-v1.5", file_name="onnx/model.onnx")
+    else:
+        mock_automodel.assert_called_once_with("katanemo/bge-large-en-v1.5", device_map=glb.DEVICE)
+
+# Test for get_zero_shot_model function
+@patch("app.loader.ORTModelForSequenceClassification.from_pretrained")
+@patch("app.loader.pipeline")
+@patch("app.loader.AutoTokenizer.from_pretrained")
+def test_get_zero_shot_model(mock_tokenizer, mock_pipeline, mock_ort_model, mock_env):
+    mock_pipeline.return_value = MagicMock()
+    mock_ort_model.return_value = MagicMock()
+    mock_tokenizer.return_value = MagicMock()
+
+    zero_shot_model = get_zero_shot_model()
+
+    # Assertions
+    assert zero_shot_model["model_name"] == "katanemo/bart-large-mnli"
+    mock_tokenizer.assert_called_once_with("katanemo/bart-large-mnli")
+    if glb.DEVICE != "cuda":
+        mock_ort_model.assert_called_once_with("katanemo/bart-large-mnli", file_name="onnx/model.onnx")
+    else:
+        mock_pipeline.assert_called_once()
+
+# Test for get_prompt_guard function
+@patch("app.loader.AutoTokenizer.from_pretrained")
+@patch("app.loader.OVModelForSequenceClassification.from_pretrained")
+@patch("app.loader.AutoModelForSequenceClassification.from_pretrained")
+def test_get_prompt_guard(mock_auto_model, mock_ov_model, mock_tokenizer):
+    # Mock model based on device
+    if glb.DEVICE == "cpu":
+        mock_ov_model.return_value = MagicMock()
+    else:
+        mock_auto_model.return_value = MagicMock()
+
+    mock_tokenizer.return_value = MagicMock()
+
+    prompt_guard = get_prompt_guard(arch_guard_model_type[glb.DEVICE])
+
+    # Assertions
+    assert prompt_guard["model_name"] == arch_guard_model_type[glb.DEVICE]
+    mock_tokenizer.assert_called_once_with(arch_guard_model_type[glb.DEVICE], trust_remote_code=True)
+    if glb.DEVICE == "cpu":
+        mock_ov_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )
+    else:
+        mock_auto_model.assert_called_once_with(
+            arch_guard_model_type[glb.DEVICE], device_map=glb.DEVICE, low_cpu_mem_usage=True
+        )