# plano/model_server/app/load_models.py
import os
import sentence_transformers
from transformers import AutoTokenizer, AutoModel, pipeline
import sqlite3
import torch
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenceClassification # type: ignore
def get_device():
    """Return the best available torch device string.

    Preference order: "cuda" (NVIDIA GPU), then "mps" (Apple Silicon),
    then "cpu" as the fallback.

    Returns:
        str: one of "cuda", "mps", or "cpu".
    """
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    # BUGFIX: corrected typo "Avialble" -> "Available" in the log message.
    print(f"Devices Available: {device}")
    return device
def load_transformers(model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5")):
    """Load the embedding model and its tokenizer.

    On CUDA hosts the plain transformers model is used; everywhere else
    the ONNX-exported model is loaded through ONNX Runtime.

    Args:
        model_name: HF hub id or local path (default from the MODELS env var).

    Returns:
        dict with "tokenizer", "model", and "model_name" keys.
    """
    print("Loading Embedding Model")
    device = get_device()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if device == "cuda":
        model = AutoModel.from_pretrained(model_name, device_map=device)
    else:
        # Non-CUDA hosts use the ONNX export for faster CPU/MPS inference.
        model = ORTModelForFeatureExtraction.from_pretrained(
            model_name, file_name="onnx/model.onnx"
        )

    return {
        "tokenizer": tokenizer,
        "model": model,
        "model_name": model_name,
    }
def load_guard_model(
    model_name,
    hardware_config="cpu",
):
    """Load a guard (prompt-safety) sequence-classification model.

    Args:
        model_name: HF hub id or local path of the guard model.
        hardware_config: "cpu" for the OpenVINO backend, "gpu" for CUDA
            (falls back to CPU when CUDA is unavailable).

    Returns:
        dict with "tokenizer", "model", "model_name", "device", and
        "hardware_config" keys.

    Raises:
        ValueError: if hardware_config is not "cpu" or "gpu".
    """
    print("Loading Guard Model")
    guard_model = {}
    guard_model["tokenizer"] = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True
    )
    guard_model["model_name"] = model_name
    if hardware_config == "cpu":
        # OpenVINO-optimized build for CPU inference; imported lazily so
        # the optional optimum-intel dependency is only needed on CPU.
        from optimum.intel import OVModelForSequenceClassification

        device = "cpu"
        guard_model["model"] = OVModelForSequenceClassification.from_pretrained(
            model_name, device_map=device, low_cpu_mem_usage=True
        )
    elif hardware_config == "gpu":
        from transformers import AutoModelForSequenceClassification

        # torch is already imported at module level; no local re-import needed.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        guard_model["model"] = AutoModelForSequenceClassification.from_pretrained(
            model_name, device_map=device, low_cpu_mem_usage=True
        )
    else:
        # BUGFIX: previously an unknown config left `device` unbound and
        # crashed below with a NameError; fail fast with a clear message.
        raise ValueError(
            f"Unsupported hardware_config: {hardware_config!r} (expected 'cpu' or 'gpu')"
        )
    guard_model["device"] = device
    guard_model["hardware_config"] = hardware_config
    return guard_model
def load_zero_shot_models(
    model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli")
):
    """Load a zero-shot classification model, tokenizer, and pipeline.

    Non-CUDA hosts use the ONNX-exported model; CUDA hosts use the
    transformers model with a sequence-classification head.

    Args:
        model_name: HF hub id or local path (default from the
            ZERO_SHOT_MODELS env var).

    Returns:
        dict with "model", "tokenizer", "pipeline", and "model_name" keys.
    """
    zero_shot_model = {}
    device = get_device()
    if device != "cuda":
        zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained(
            model_name, file_name="onnx/model.onnx"
        )
    else:
        # BUGFIX: the zero-shot-classification pipeline requires a model with
        # a sequence-classification head; bare AutoModel has none and the
        # pipeline construction/inference would fail. Local import mirrors
        # the style used in load_guard_model's gpu branch.
        from transformers import AutoModelForSequenceClassification

        zero_shot_model["model"] = AutoModelForSequenceClassification.from_pretrained(
            model_name
        )
    zero_shot_model["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
    # create pipeline
    zero_shot_model["pipeline"] = pipeline(
        "zero-shot-classification",
        model=zero_shot_model["model"],
        tokenizer=zero_shot_model["tokenizer"],
        device=device,
    )
    zero_shot_model["model_name"] = model_name
    return zero_shot_model
if __name__ == "__main__":
    # Smoke check when run as a script: report the device this host selects.
    print(get_device())